From 0cac331e6e525851b054db3a9932154e12cb2fcd Mon Sep 17 00:00:00 2001
From: CarolineConcatto <51754594+CarolineConcatto@users.noreply.github.com>
Date: Tue, 12 Sep 2023 17:31:51 +0100
Subject: [PATCH 01/77] [AArch64][SME]Update intrinsic interface for ldr/str
 (#65593)

The new ACLE PR#225[1] now combines the slice parameters for some
builtins.

[1]https://github.com/ARM-software/acle/pull/225/files

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
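Note on the interface change (cover-note comment only, not part of the
applied patch): a minimal before/after sketch in C, assuming a
streaming-compatible caller with ZA state, and assuming <stdint.h> and the
SME ACLE header are available; copy_za_slice and its arguments are
illustrative names, not from this patch.

    void copy_za_slice(uint32_t slice_base, const void *src, void *dst) {
      // Old interface placed the vnum immediate between the slice and the
      // pointer:
      //   svldr_vnum_za(slice_base, 0, src);
      //   svstr_vnum_za(slice_base, 0, dst);
      // New interface: the pointer comes second, the vnum immediate last.
      svldr_vnum_za(slice_base, src, 0);
      svstr_vnum_za(slice_base, dst, 0);
      // New forms with no vnum operand at all:
      svldr_za(slice_base, src);
      svstr_za(slice_base, dst);
    }
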
 clang/include/clang/Basic/arm_sme.td          | 16 +++++++++---
 clang/lib/CodeGen/CGBuiltin.cpp               | 25 +++++++++++--------
 .../aarch64-sme-intrinsics/acle_sme_ldr.c     | 13 ++++++++--
 .../aarch64-sme-intrinsics/acle_sme_str.c     | 14 +++++++++--
 .../aarch64-sme-intrinsics/acle_sme_imm.cpp   |  8 +++---
 5 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index b950f5cb8acc..bcb1d3b27db7 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -44,10 +44,14 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0
 defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
 defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>;
 
-def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmiQ", "",
+def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQi", "",
                           [IsOverloadNone, IsStreamingCompatible, IsSharedZA],
                           MemEltTyDefault, "aarch64_sme_ldr",
-                          [ImmCheck<1, ImmCheck0_15>]>;
+                          [ImmCheck<2, ImmCheck0_15>]>;
+
+def SVLDR_ZA : MInst<"svldr_za", "vmQ", "",
+                     [IsOverloadNone, IsStreamingCompatible, IsSharedZA],
+                     MemEltTyDefault, "aarch64_sme_ldr", []>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Stores
@@ -78,10 +82,14 @@ defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck
 defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
 defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>;
 
-def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vmi%", "",
+def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%i", "",
                           [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA],
                           MemEltTyDefault, "aarch64_sme_str",
-                          [ImmCheck<1, ImmCheck0_15>]>;
+                          [ImmCheck<2, ImmCheck0_15>]>;
+
+def SVSTR_ZA : MInst<"svstr_za", "vm%", "",
+                     [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA],
+                     MemEltTyDefault, "aarch64_sme_str", []>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Read horizontal/vertical ZA slices

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 8f87c4d46109..d8690bb2f14a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9521,15 +9521,18 @@ Value *CodeGenFunction::EmitSMEZero(SVETypeFlags TypeFlags,
 Value *CodeGenFunction::EmitSMELdrStr(SVETypeFlags TypeFlags,
                                       SmallVectorImpl<Value *> &Ops,
                                       unsigned IntID) {
-  Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
-  llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
-  llvm::Value *MulVL = Builder.CreateMul(
-      CntsbCall,
-      Builder.getInt64(cast<ConstantInt>(Ops[1])->getZExtValue()),
-      "mulvl");
-  Ops[2] = Builder.CreateGEP(Int8Ty, Ops[2], MulVL);
-  Ops[0] = EmitTileslice(Ops[1], Ops[0]);
-  Ops.erase(&Ops[1]);
+  if (Ops.size() == 3) {
+    Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
+    llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
+    llvm::Value *MulVL = Builder.CreateMul(
+        CntsbCall,
+        Builder.getInt64(cast<ConstantInt>(Ops[2])->getZExtValue()),
+        "mulvl");
+
+    Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL);
+    Ops[0] = EmitTileslice(Ops[0], Ops[2]);
+    Ops.erase(&Ops[2]);
+  }
   Function *F = CGM.getIntrinsic(IntID, {});
   return Builder.CreateCall(F, Ops);
 }
@@ -9999,7 +10002,9 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
       BuiltinID == SME::BI__builtin_sme_svzero_za)
     return EmitSMEZero(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
-           BuiltinID == SME::BI__builtin_sme_svstr_vnum_za)
+           BuiltinID == SME::BI__builtin_sme_svstr_vnum_za ||
+           BuiltinID == SME::BI__builtin_sme_svldr_za ||
+           BuiltinID == SME::BI__builtin_sme_svstr_za)
     return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (Builtin->LLVMIntrinsic != 0) {
     // Predicates must match the main datatype.
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index 7efa8b155685..acddc2ef50a3 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -12,7 +12,7 @@
 // CHECK-NEXT:    ret void
 //
 void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
-  svldr_vnum_za(slice_base, 0, ptr);
+  svldr_vnum_za(slice_base, ptr, 0);
 }
 
 // CHECK-C-LABEL: @test_svldr_vnum_za_1(
@@ -26,5 +26,14 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
 // CHECK-NEXT:    ret void
 //
 void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
-  svldr_vnum_za(slice_base, 15, ptr);
+  svldr_vnum_za(slice_base, ptr, 15);
+}
+
+// CHECK-C-LABEL: @test_svldr_za(
+// CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT:    ret void
+void test_svldr_za(uint32_t slice_base, const void *ptr) {
+  svldr_za(slice_base, ptr);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index 12aa298858a1..2728f9ac0cd1 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -12,7 +12,7 @@
 // CHECK-NEXT:    ret void
 //
 void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
-  svstr_vnum_za(slice_base, 0, ptr);
+  svstr_vnum_za(slice_base, ptr, 0);
 }
 
 // CHECK-C-LABEL: @test_svstr_vnum_za_1(
@@ -26,5 +26,15 @@ void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
 // CHECK-NEXT:    ret void
 //
 void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) {
-  svstr_vnum_za(slice_base, 15, ptr);
+  svstr_vnum_za(slice_base, ptr, 15);
+}
+
+// CHECK-C-LABEL: @test_svstr_za(
+// CHECK-CXX-LABEL: @_Z13test_svstr_zajPv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT:    ret void
+//
+void test_svstr_za(uint32_t slice_base, void *ptr) {
+  svstr_za(slice_base, ptr);
 }
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
index c08f9f25eb47..ca5bc1454d3d 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
@@ -211,9 +211,9 @@ void test_range_0_15(svbool_t pg, void *ptr) {
   SVE_ACLE_FUNC(svst1_ver_vnum_za8,,,)(0, -1, 16, pg, ptr, 1);
   // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
-  SVE_ACLE_FUNC(svldr_vnum_za,,,)(-1, 16, ptr);
+  SVE_ACLE_FUNC(svldr_vnum_za,,,)(-1, ptr, 16);
   // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
-  SVE_ACLE_FUNC(svstr_vnum_za,,,)(-1, -1, ptr);
+  SVE_ACLE_FUNC(svstr_vnum_za,,,)(-1, ptr, -1);
   // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
   SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, -1, 0);
@@ -250,8 +250,8 @@ void test_constant(uint64_t u64, svbool_t pg, void *ptr) {
   SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}}
   SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(0, u64, u64, pg, ptr, u64); // expected-error {{argument to 'svst1_ver_vnum_za64' must be a constant integer}}
-  SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, u64, ptr); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}}
-  SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, u64, ptr); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}}
+  SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}}
+  SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}}
   SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, 0, u64, u64); // expected-error-re {{argument to 'svread_hor_za8{{.*}}_m' must be a constant integer}}
   SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}}
-- 
Gitee
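Comment on the lowering in the patch above (not part of the applied patch):
for the vnum forms, EmitSMELdrStr offsets both the ZA slice and the pointer
by vnum. The pointer is advanced by vnum multiples of the streaming vector
length in bytes (the aarch64_sme_cntsb intrinsic), and the slice used is
slice_base + vnum (EmitTileslice). A C sketch of the effective address,
where za_ldr_addr and svl_bytes are illustrative names and svl_bytes stands
in for the cntsb intrinsic call:

    #include <stdint.h>

    // Effective address = ptr + vnum * SVL_bytes; this mirrors the
    // Builder.CreateMul / Builder.CreateGEP(Int8Ty, ...) sequence above.
    static inline const void *za_ldr_addr(const void *ptr, uint64_t vnum,
                                          uint64_t svl_bytes) {
      return (const uint8_t *)ptr + vnum * svl_bytes;
    }
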
From cb1075216328da89377cd78d17b3db6cb3b40d5e Mon Sep 17 00:00:00 2001
From: CarolineConcatto <51754594+CarolineConcatto@users.noreply.github.com>
Date: Tue, 12 Sep 2023 18:08:57 +0100
Subject: [PATCH 02/77] [AArch64][SME]Update intrinsic interface for
 read/write (#65594)

The new ACLE PR#225[1] now combines the slice parameters for some
builtins. This patch is the second of three patches updating the
interface.

The slice parameter now specifies the ZA slice number directly; the user
must compute it explicitly as the base register plus the immediate offset.

[1]https://github.com/ARM-software/acle/pull/225/files

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
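Note on the interface change (cover-note comment only, not part of the
applied patch): a before/after sketch in C mirroring the updated tests
below. Function and variable names are illustrative; svint8_t, svbool_t,
and the svread/svwrite intrinsics come from the SVE/SME ACLE headers, which
are assumed available, and the required streaming/ZA function attributes
are omitted for brevity.

    svint8_t read_row_15(svint8_t zd, svbool_t pg, uint32_t slice_base) {
      // Old interface:
      //   svread_hor_za8_s8_m(zd, pg, /*tile*/0, slice_base, /*offset*/15);
      uint32_t slice = slice_base + 15;  // caller now folds the offset itself
      return svread_hor_za8_s8_m(zd, pg, /*tile*/0, slice);
    }

    void write_row_15(uint32_t slice_base, svbool_t pg, svint8_t zn) {
      // Old interface:
      //   svwrite_hor_za8_s8_m(/*tile*/0, slice_base, /*offset*/15, pg, zn);
      svwrite_hor_za8_s8_m(/*tile*/0, slice_base + 15, pg, zn);
    }
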
 clang/include/clang/Basic/arm_sme.td          |  28 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |  11 +-
 .../aarch64-sme-intrinsics/acle_sme_read.c    | 256 ++++++++++--------
 .../aarch64-sme-intrinsics/acle_sme_write.c   | 216 ++++++++-------
 .../aarch64-sme-intrinsics/acle_sme_imm.cpp   |  96 ++-----
 5 files changed, 304 insertions(+), 303 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index bcb1d3b27db7..ff159471df90 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -96,42 +96,42 @@ def SVSTR_ZA : MInst<"svstr_za", "vm%", "",
 
 multiclass ZARead<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
   let TargetGuard = "sme" in {
-    def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPimi", t,
+    def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPim", t,
                           MergeOp1, i_prefix # "_horiz",
                           [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>;
-    def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPimi", t,
+    def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPim", t,
                           MergeOp1, i_prefix # "_vert",
                           [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>;
   }
 }
 
-defm SVREAD_ZA8 : ZARead<"za8", "cUc", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_0>, ImmCheck<4, ImmCheck0_15>]>;
-defm SVREAD_ZA16 : ZARead<"za16", "sUshb", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_1>, ImmCheck<4, ImmCheck0_7>]>;
-defm SVREAD_ZA32 : ZARead<"za32", "iUif", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>;
-defm SVREAD_ZA64 : ZARead<"za64", "lUld", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>;
-defm SVREAD_ZA128 : ZARead<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_readq", [ImmCheck<2, ImmCheck0_15>, ImmCheck<4, ImmCheck0_0>]>;
+defm SVREAD_ZA8 : ZARead<"za8", "cUc", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_0>]>;
+defm SVREAD_ZA16 : ZARead<"za16", "sUshb", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_1>]>;
+defm SVREAD_ZA32 : ZARead<"za32", "iUif", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_3>]>;
+defm SVREAD_ZA64 : ZARead<"za64", "lUld", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_7>]>;
+defm SVREAD_ZA128 : ZARead<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_readq", [ImmCheck<2, ImmCheck0_15>]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Write horizontal/vertical ZA slices
 
 multiclass ZAWrite<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
   let TargetGuard = "sme" in {
-    def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimiPd", t,
+    def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimPd", t,
                           MergeOp1, i_prefix # "_horiz",
                           [IsWriteZA, IsStreaming, IsSharedZA], ch>;
-    def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimiPd", t,
+    def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimPd", t,
                           MergeOp1, i_prefix # "_vert",
                           [IsWriteZA, IsStreaming, IsSharedZA], ch>;
   }
 }
 
-defm SVWRITE_ZA8 : ZAWrite<"za8", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>;
-defm SVWRITE_ZA16 : ZAWrite<"za16", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>;
-defm SVWRITE_ZA32 : ZAWrite<"za32", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>;
-defm SVWRITE_ZA64 : ZAWrite<"za64", "lUld",
"aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; -defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_writeq", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>; +defm SVWRITE_ZA8 : ZAWrite<"za8", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>]>; +defm SVWRITE_ZA16 : ZAWrite<"za16", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>]>; +defm SVWRITE_ZA32 : ZAWrite<"za32", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>]>; +defm SVWRITE_ZA64 : ZAWrite<"za64", "lUld", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>]>; +defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_writeq", [ImmCheck<0, ImmCheck0_15>]>; //////////////////////////////////////////////////////////////////////////////// // SME - Zero diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d8690bb2f14a..bcad4218a573 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9496,15 +9496,10 @@ Value *CodeGenFunction::EmitSMEReadWrite(SVETypeFlags TypeFlags, unsigned IntID) { auto *VecTy = getSVEType(TypeFlags); Function *F = CGM.getIntrinsic(IntID, VecTy); - if (TypeFlags.isReadZA()) { + if (TypeFlags.isReadZA()) Ops[1] = EmitSVEPredicateCast(Ops[1], VecTy); - Ops[3] = EmitTileslice(Ops[4], Ops[3]); - Ops.erase(&Ops[4]); - } else if (TypeFlags.isWriteZA()) { - Ops[1] = EmitTileslice(Ops[2], Ops[1]); - Ops[2] = EmitSVEPredicateCast(Ops[3], VecTy); - Ops.erase(&Ops[3]); - } + else if (TypeFlags.isWriteZA()) + Ops[2] = EmitSVEPredicateCast(Ops[2], VecTy); return Builder.CreateCall(F, Ops); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c index 19e2b42e13f2..f7a0852387e8 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -20,7 +20,7 @@ // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za8_s8_1( @@ -31,7 +31,8 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base, 15); + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_s16( @@ -42,19 +43,20 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_s16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( 
[[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za32_s32( @@ -65,19 +67,20 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP1]] // svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za32_s32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice_base, 3); + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_hor_za64_s64( @@ -88,19 +91,20 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za64_s64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice_base, 1); + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_hor_za8_u8( @@ -110,7 +114,7 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za8_u8_1( @@ -121,7 +125,8 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return 
SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base, 15); + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_u16( @@ -132,19 +137,20 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_u16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za32_u32( @@ -155,19 +161,20 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za32_u32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice_base, 3); + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_hor_za64_u64( @@ -178,19 +185,20 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za64_u64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice_base, 1); + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_f16( @@ -201,19 +209,20 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_f16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_bf16( @@ -224,19 +233,20 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_bf16_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za16_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za32_f32( @@ -247,19 +257,20 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za32_f32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice_base, 3); + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_hor_za64_f64( @@ -270,19 +281,20 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za64_f64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice_base, 1); + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_hor_za128_s8( @@ -292,7 +304,7 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s8_1( @@ -302,7 +314,7 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s16( @@ -313,7 +325,7 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s16_1( @@ -324,7 +336,7 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base, 0); + return 
SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s32( @@ -335,7 +347,7 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s32_1( @@ -346,7 +358,7 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: ret [[TMP1]] // svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s64( @@ -357,7 +369,7 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s64_1( @@ -368,7 +380,7 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u8( @@ -378,7 +390,7 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u8_1( @@ -388,7 +400,7 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u16( @@ -399,7 +411,7 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u16_1( @@ -410,7 +422,7 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u32( @@ -421,7 +433,7 @@ svuint16_t 
test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u32_1( @@ -432,7 +444,7 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u64( @@ -443,7 +455,7 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u64_1( @@ -454,7 +466,7 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f16( @@ -465,7 +477,7 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f16_1( @@ -476,7 +488,7 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_bf16( @@ -487,7 +499,7 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_bf16_1( @@ -498,7 +510,7 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f32( @@ -509,7 +521,7 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: ret [[TMP1]] // svfloat32_t 
test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f32_1( @@ -520,7 +532,7 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f64( @@ -531,7 +543,7 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f64_1( @@ -542,7 +554,7 @@ svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za8_s8( @@ -552,7 +564,7 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za8_s8_1( @@ -563,7 +575,8 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base, 15); + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_s16( @@ -574,19 +587,20 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_s16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return 
SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_ver_za32_s32( @@ -597,19 +611,20 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP1]] // svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za32_s32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice_base, 3); + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_ver_za64_s64( @@ -620,19 +635,20 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za64_s64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice_base, 1); + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_ver_za8_u8( @@ -642,7 +658,7 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za8_u8_1( @@ -653,7 +669,8 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base, 15); + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_u16( @@ -664,19 +681,20 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, 
uint32_t slice_bas // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_u16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_ver_za32_u32( @@ -687,19 +705,20 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za32_u32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice_base, 3); + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_ver_za64_u64( @@ -710,19 +729,20 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za64_u64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice_base, 1); + 
uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_f16( @@ -733,19 +753,20 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_f16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_bf16( @@ -756,19 +777,20 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_bf16_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za16_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice_base, 7); + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_ver_za32_f32( @@ -779,19 +801,20 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za32_f32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice_base, 3); + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_ver_za64_f64( @@ -802,19 +825,20 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za64_f64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice_base, 1); + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_ver_za128_s8( @@ -824,7 +848,7 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s8_1( @@ -834,7 +858,7 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: ret [[TMP0]] // svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s16( @@ -845,7 +869,7 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s16_1( @@ -856,7 +880,7 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: ret [[TMP1]] // svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s32( @@ -867,7 +891,7 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svint32_t 
test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s32_1( @@ -878,7 +902,7 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: ret [[TMP1]] // svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s64( @@ -889,7 +913,7 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s64_1( @@ -900,7 +924,7 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: ret [[TMP1]] // svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u8( @@ -910,7 +934,7 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u8_1( @@ -920,7 +944,7 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u16( @@ -931,7 +955,7 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u16_1( @@ -942,7 +966,7 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u32( @@ -953,7 +977,7 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base, 0); + return 
SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u32_1( @@ -964,7 +988,7 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u64( @@ -975,7 +999,7 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u64_1( @@ -986,7 +1010,7 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f16( @@ -997,7 +1021,7 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f16_1( @@ -1008,7 +1032,7 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_bf16( @@ -1019,7 +1043,7 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_bf16_1( @@ -1030,7 +1054,7 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f32( @@ -1041,7 +1065,7 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f32_1( @@ -1052,7 
+1076,7 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f64( @@ -1063,7 +1087,7 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f64_1( @@ -1074,5 +1098,5 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base, 0); + return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c index 66e4550294ac..395918b936b3 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -20,7 +20,7 @@ // CHECK-NEXT: ret void // void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za8_s8_1( @@ -31,7 +31,8 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, 15, pg, zn); + uint32_t slice = slice_base + 15; + SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_s16( @@ -42,7 +43,7 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_s16_1( @@ -54,7 +55,8 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za32_s32( @@ -65,7 +67,7 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za32_s32_1( @@ -77,7 +79,8 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, 
svint32_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice_base, 3, pg, zn); + uint32_t slice = slice_base + 3; + SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za64_s64( @@ -88,7 +91,7 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za64_s64_1( @@ -100,7 +103,8 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice_base, 1, pg, zn); + uint32_t slice = slice_base + 1; + SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za8_u8( @@ -110,7 +114,7 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za8_u8_1( @@ -121,7 +125,8 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, 15, pg, zn); + uint32_t slice = slice_base + 15; + SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_u16( @@ -132,7 +137,7 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_u16_1( @@ -144,7 +149,8 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za32_u32( @@ -155,7 +161,7 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za32_u32_1( @@ -167,7 +173,8 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice_base, 3, pg, zn); + uint32_t slice = slice_base + 3; + SME_ACLE_FUNC(svwrite_hor_za32, 
_u32, _m)(3, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za64_u64( @@ -178,7 +185,7 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za64_u64_1( @@ -190,7 +197,8 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice_base, 1, pg, zn); + uint32_t slice = slice_base + 1; + SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_f16( @@ -201,7 +209,7 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_f16_1( @@ -213,7 +221,8 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_bf16( @@ -224,7 +233,7 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z // CHECK-NEXT: ret void // void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za16_bf16_1( @@ -236,7 +245,8 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z // CHECK-NEXT: ret void // void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za32_f32( @@ -247,7 +257,7 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: ret void // void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za32_f32_1( @@ -259,7 +269,8 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice_base, 3, pg, zn); + uint32_t slice = slice_base + 3; + SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za64_f64( @@ -270,7 +281,7 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z // CHECK-NEXT: ret void // void 
test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za64_f64_1( @@ -282,7 +293,8 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice_base, 1, pg, zn); + uint32_t slice = slice_base + 1; + SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s8( @@ -292,7 +304,7 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s8_1( @@ -302,7 +314,7 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s16( @@ -313,7 +325,7 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s16_1( @@ -324,7 +336,7 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s32( @@ -335,7 +347,7 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s32_1( @@ -346,7 +358,7 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s64( @@ -357,7 +369,7 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_s64_1( @@ -368,7 +380,7 @@ void 
test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u8( @@ -378,7 +390,7 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u8_1( @@ -388,7 +400,7 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u16( @@ -399,7 +411,7 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u16_1( @@ -410,7 +422,7 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u32( @@ -421,7 +433,7 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u32_1( @@ -432,7 +444,7 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u64( @@ -443,7 +455,7 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_u64_1( @@ -454,7 +466,7 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn); } 
// CHECK-C-LABEL: @test_svwrite_hor_za128_f16( @@ -465,7 +477,7 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: ret void // void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_f16_1( @@ -476,7 +488,7 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_bf16( @@ -487,7 +499,7 @@ void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: ret void // void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_bf16_1( @@ -498,7 +510,7 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: ret void // void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_f32( @@ -509,7 +521,7 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: ret void // void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_f32_1( @@ -520,7 +532,7 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_f64( @@ -531,7 +543,7 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: ret void // void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_hor_za128_f64_1( @@ -542,7 +554,7 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: ret void // void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za8_s8( @@ -552,7 +564,7 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t // CHECK-NEXT: ret void // void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, 
0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za8_s8_1( @@ -563,7 +575,8 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, 15, pg, zn); + uint32_t slice = slice_base + 15; + SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za16_s16( @@ -574,7 +587,7 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za16_s16_1( @@ -586,7 +599,8 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za32_s32( @@ -597,7 +611,7 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za32_s32_1( @@ -609,7 +623,8 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice_base, 3, pg, zn); + uint32_t slice = slice_base + 3; + SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za64_s64( @@ -620,7 +635,7 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za64_s64_1( @@ -632,7 +647,8 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice_base, 1, pg, zn); + uint32_t slice = slice_base + 1; + SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za8_u8( @@ -642,7 +658,7 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za8_u8_1( @@ -653,7 +669,8 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: ret void // void 
test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, 15, pg, zn); + uint32_t slice = slice_base + 15; + SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za16_u16( @@ -664,7 +681,7 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za16_u16_1( @@ -676,7 +693,8 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za32_u32( @@ -687,7 +705,7 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za32_u32_1( @@ -699,7 +717,8 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice_base, 3, pg, zn); + uint32_t slice = slice_base + 3; + SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za64_u64( @@ -710,7 +729,7 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za64_u64_1( @@ -722,7 +741,8 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice_base, 1, pg, zn); + uint32_t slice = slice_base + 1; + SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za16_f16( @@ -733,7 +753,7 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za16_f16_1( @@ -745,7 +765,8 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn); 
} // CHECK-C-LABEL: @test_svwrite_ver_za16_bf16( @@ -756,7 +777,7 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z // CHECK-NEXT: ret void // void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za16_bf16_1( @@ -768,7 +789,8 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z // CHECK-NEXT: ret void // void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice_base, 7, pg, zn); + uint32_t slice = slice_base + 7; + SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za32_f32( @@ -779,7 +801,7 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: ret void // void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za32_f32_1( @@ -791,7 +813,8 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice_base, 3, pg, zn); + uint32_t slice = slice_base + 3; + SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za64_f64( @@ -802,7 +825,7 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z // CHECK-NEXT: ret void // void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za64_f64_1( @@ -814,7 +837,8 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice_base, 1, pg, zn); + uint32_t slice = slice_base + 1; + SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s8( @@ -824,7 +848,7 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s8_1( @@ -834,7 +858,7 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s16( @@ -845,7 +869,7 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { - 
SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s16_1( @@ -856,7 +880,7 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s32( @@ -867,7 +891,7 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s32_1( @@ -878,7 +902,7 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s64( @@ -889,7 +913,7 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_s64_1( @@ -900,7 +924,7 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u8( @@ -910,7 +934,7 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u8_1( @@ -920,7 +944,7 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: ret void // void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u16( @@ -931,7 +955,7 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u16_1( @@ -942,7 +966,7 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: ret void // void 
test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u32( @@ -953,7 +977,7 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: ret void // void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u32_1( @@ -964,7 +988,7 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u64( @@ -975,7 +999,7 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: ret void // void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u64_1( @@ -986,7 +1010,7 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: ret void // void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f16( @@ -997,7 +1021,7 @@ void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: ret void // void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f16_1( @@ -1008,7 +1032,7 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_bf16( @@ -1019,7 +1043,7 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: ret void // void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_bf16_1( @@ -1030,7 +1054,7 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: ret void // void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f32( @@ -1041,7 +1065,7 @@ void 
test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: ret void // void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f32_1( @@ -1052,7 +1076,7 @@ void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f64( @@ -1063,7 +1087,7 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: ret void // void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f64_1( @@ -1074,5 +1098,5 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: ret void // void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, 0, pg, zn); + SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn); } diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index ca5bc1454d3d..3316c28cebdb 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -12,7 +12,7 @@ #include -void test_range_0_0(svbool_t pg, void *ptr) { +void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, -1, 0, pg, ptr); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} @@ -31,24 +31,16 @@ void test_range_0_0(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(15, -1, 1, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, -1, -1, 0); + SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, -1, slice); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_ver_za8, _s8, _m,)(svundef_s8(), pg, 1, -1, 15); + SVE_ACLE_FUNC(svread_ver_za8, _s8, _m,)(svundef_s8(), pg, 1, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, 0, -1, -1); + SVE_ACLE_FUNC(svwrite_hor_za8, _s8, _m,)(-1, slice, pg, svundef_s8()); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_ver_za128, _s8, _m,)(svundef_s8(), pg, 15, -1, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_hor_za8, _s8, _m,)(-1, -1, 0, pg, svundef_s8()); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, -1, 15, pg, svundef_s8()); - // expected-error@+1 {{argument value 
18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_hor_za128, _s8, _m,)(0, -1, -1, pg, svundef_s8()); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(15, -1, 1, pg, svundef_s8()); + SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, slice, pg, svundef_s8()); } -void test_range_0_1(svbool_t pg, void *ptr) { +void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, -1, 0, pg, ptr); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} @@ -67,24 +59,16 @@ void test_range_0_1(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(7, -1, 2, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_hor_za16, _s16, _m,)(svundef_s16(), pg, -1, -1, 0); - // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, 2, -1, 7); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_hor_za64, _s64, _m,)(svundef_s64(), pg, 0, -1, -1); - // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_ver_za64, _s64, _m,)(svundef_s64(), pg, 7, -1, 2); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_hor_za16, _s16, _m,)(-1, -1, 0, pg, svundef_s16()); + SVE_ACLE_FUNC(svread_hor_za16, _s16, _m,)(svundef_s16(), pg, -1, slice); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, -1, 7, pg, svundef_s16()); + SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, 2, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_hor_za64, _s64, _m,)(0, -1, -1, pg, svundef_s64()); + SVE_ACLE_FUNC(svwrite_hor_za16, _s16, _m,)(-1, slice, pg, svundef_s16()); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(7, -1, 2, pg, svundef_s64()); + SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, slice, pg, svundef_s16()); } -void test_range_0_3(svbool_t pg, void *ptr) { +void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, -1, 0, pg, ptr); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} @@ -103,21 +87,13 @@ void test_range_0_3(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svst1_ver_vnum_za32,,,)(3, -1, 4, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svread_hor_za32, _s32, _m,)(svundef_s32(), pg, -1, -1, 0); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svread_ver_za32, _s32, _m,)(svundef_s32(), pg, 4, -1, 3); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svread_hor_za32, _s32, _m,)(svundef_s32(), pg, 0, -1, -1); + SVE_ACLE_FUNC(svread_hor_za32, _s32, _m,)(svundef_s32(), pg, -1, slice); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - 
SVE_ACLE_FUNC(svread_ver_za32, _s32, _m,)(svundef_s32(), pg, 3, -1, 4); + SVE_ACLE_FUNC(svread_ver_za32, _s32, _m,)(svundef_s32(), pg, 4, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(-1, -1, 0, pg, svundef_s32()); + SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(-1, slice, pg, svundef_s32()); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_ver_za32, _s32, _m,)(4, -1, 3, pg, svundef_s32()); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(0, -1, -1, pg, svundef_s32()); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_ver_za32, _s32, _m,)(3, -1, 4, pg, svundef_s32()); + SVE_ACLE_FUNC(svwrite_ver_za32, _s32, _m,)(4, slice, pg, svundef_s32()); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svaddha_za32, _s32, _m,)(4, pg, pg, svundef_s32()); @@ -138,7 +114,7 @@ void test_range_0_3(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svusmops_za32, _u8, _m,)(-1, pg, pg, svundef_u8(), svundef_s8()); } -void test_range_0_7(svbool_t pg, void *ptr) { +void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, -1, 0, pg, ptr); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} @@ -157,21 +133,13 @@ void test_range_0_7(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svst1_ver_vnum_za16,,,)(1, -1, 8, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_hor_za64, _s64, _m,)(svundef_s64(), pg, -1, -1, 0); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_ver_za64, _s64, _m,)(svundef_s64(), pg, 8, -1, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_hor_za16, _s16, _m,)(svundef_s16(), pg, 0, -1, -1); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, 1, -1, 8); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_hor_za64, _s64, _m,)(-1, -1, 0, pg, svundef_s64()); + SVE_ACLE_FUNC(svread_hor_za64, _s64, _m,)(svundef_s64(), pg, -1, slice); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(8, -1, 1, pg, svundef_s64()); + SVE_ACLE_FUNC(svread_ver_za64, _s64, _m,)(svundef_s64(), pg, 8, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_hor_za16, _s16, _m,)(0, -1, -1, pg, svundef_s16()); + SVE_ACLE_FUNC(svwrite_hor_za64, _s64, _m,)(-1, slice, pg, svundef_s64()); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(1, -1, 8, pg, svundef_s16()); + SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(8, slice, pg, svundef_s64()); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svaddha_za64, _s64, _m,)(8, pg, pg, svundef_s64()); @@ -192,7 +160,7 @@ void test_range_0_7(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svusmops_za64, _u16, _m,)(-1, pg, pg, 
svundef_u16(), svundef_s16()); } -void test_range_0_15(svbool_t pg, void *ptr) { +void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, -1, 0, pg, ptr); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} @@ -216,21 +184,13 @@ void test_range_0_15(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svstr_vnum_za,,,)(-1, ptr, -1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, -1, 0); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_ver_za128, _s8, _m,)(svundef_s8(), pg, 16, -1, 0); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, 0, -1, -1); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_ver_za8, _s8, _m,)(svundef_s8(), pg, 0, -1, 16); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_hor_za128, _s8, _m,)(-1, -1, 0, pg, svundef_s8()); + SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, slice); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, -1, 0, pg, svundef_s8()); + SVE_ACLE_FUNC(svread_ver_za128, _s8, _m,)(svundef_s8(), pg, 16, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_hor_za8, _s8, _m,)(0, -1, -1, pg, svundef_s8()); + SVE_ACLE_FUNC(svwrite_hor_za128, _s8, _m,)(-1, slice, pg, svundef_s8()); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(0, -1, 16, pg, svundef_s8()); + SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, slice, pg, svundef_s8()); } void test_range_0_255(svbool_t pg, void *ptr) { @@ -253,8 +213,6 @@ void test_constant(uint64_t u64, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}} SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}} - SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, 0, u64, u64); // expected-error-re {{argument to 'svread_hor_za8{{.*}}_m' must be a constant integer}} - SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}} - SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(0, u64, u64, pg, svundef_s32()); // expected-error-re {{argument to 'svwrite_hor_za32{{.*}}_m' must be a constant integer}} - SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(u64, u64, 0, pg, svundef_s64()); // expected-error-re {{argument to 'svwrite_ver_za64{{.*}}_m' must be a constant integer}} + SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}} + SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(u64, 0, pg, svundef_s64()); // expected-error-re {{argument to 'svwrite_ver_za64{{.*}}_m' must be a constant integer}} } -- Gitee From 806974d292c5473d5cd37fb169ce427bab114c27 Mon Sep 17 00:00:00 2001 From: 
CarolineConcatto <51754594+CarolineConcatto@users.noreply.github.com>
Date: Wed, 13 Sep 2023 15:24:09 +0100
Subject: [PATCH 03/77] [AArch64][SME]Update intrinsic interface for ld1/st1 (#65582)

The new ACLE PR#225[1] now combines the slice parameters for some builtins.
Slice specifies the ZA slice number directly and needs to be explicitly formed
by the user as the base register plus the immediate offset (an illustrative
sketch of the old and new call shapes is appended after the svld1 tests below).

[1]https://github.com/ARM-software/acle/pull/225/files

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 clang/include/clang/Basic/arm_sme.td          | 36 ++++----
 clang/lib/CodeGen/CGBuiltin.cpp               | 16 ++--
 .../aarch64-sme-intrinsics/acle_sme_ld1.c     | 40 ++++----
 .../acle_sme_ld1_vnum.c                       | 40 ++++----
 .../aarch64-sme-intrinsics/acle_sme_st1.c     | 40 ++++----
 .../acle_sme_st1_vnum.c                       | 40 ++++----
 .../aarch64-sme-intrinsics/acle_sme_imm.cpp   | 92 +++++--------------
 .../aarch64-sme-intrinsics/acle_sme_target.c  |  8 +-
 8 files changed, 134 insertions(+), 178 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index ff159471df90..be9b09980165 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -20,29 +20,29 @@ include "arm_sve_sme_incl.td"

 multiclass ZALoad<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
   let TargetGuard = "sme" in {
-    def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimiPQ", t,
+    def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimPQ", t,
                           [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA],
                           MemEltTyDefault, i_prefix # "_horiz", ch>;

-    def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimiPQl", t,
+    def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimPQl", t,
                                [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA],
                                MemEltTyDefault, i_prefix # "_horiz", ch>;

-    def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimiPQ", t,
+    def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimPQ", t,
                           [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA],
                           MemEltTyDefault, i_prefix # "_vert", ch>;

-    def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimiPQl", t,
+    def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimPQl", t,
                                [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA],
                                MemEltTyDefault, i_prefix # "_vert", ch>;
   }
 }

-defm SVLD1_ZA8 : ZALoad<"za8", "c", "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>;
-defm SVLD1_ZA16 : ZALoad<"za16", "s", "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>;
-defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>;
-defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>;
-defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>;
+defm SVLD1_ZA8 : ZALoad<"za8", "c", "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0_0>]>;
+defm SVLD1_ZA16 : ZALoad<"za16", "s", "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>]>;
+defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>;
+defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>;

 def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQi", "",
                           [IsOverloadNone, IsStreamingCompatible, IsSharedZA],
@@ -58,29 +58,29 @@ def SVLDR_ZA : MInst<"svldr_za", "vmQ", "",

 multiclass ZAStore<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
   let TargetGuard = "sme" in {
-    def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimiP%", t,
+ def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimP%", t, [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], MemEltTyDefault, i_prefix # "_horiz", ch>; - def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimiP%l", t, + def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimP%l", t, [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], MemEltTyDefault, i_prefix # "_horiz", ch>; - def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimiP%", t, + def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimP%", t, [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], MemEltTyDefault, i_prefix # "_vert", ch>; - def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimiP%l", t, + def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimP%l", t, [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], MemEltTyDefault, i_prefix # "_vert", ch>; } } -defm SVST1_ZA8 : ZAStore<"za8", "c", "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; -defm SVST1_ZA16 : ZAStore<"za16", "s", "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; -defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; -defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; -defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>; +defm SVST1_ZA8 : ZAStore<"za8", "c", "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0_0>]>; +defm SVST1_ZA16 : ZAStore<"za16", "s", "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>]>; +defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>]>; +defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>; +defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%i", "", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index bcad4218a573..19e256e693c4 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9464,29 +9464,29 @@ Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) { Value *CodeGenFunction::EmitSMELd1St1(SVETypeFlags TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { - Ops[3] = EmitSVEPredicateCast( - Ops[3], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags))); + Ops[2] = EmitSVEPredicateCast( + Ops[2], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags))); SmallVector<Value *> NewOps; - NewOps.push_back(Ops[3]); + NewOps.push_back(Ops[2]); - llvm::Value *BasePtr = Ops[4]; + llvm::Value *BasePtr = Ops[3]; // If the intrinsic contains the vnum parameter, multiply it with the vector // size in bytes. - if (Ops.size() == 6) { + if (Ops.size() == 5) { Function *StreamingVectorLength = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); llvm::Value *StreamingVectorLengthCall = Builder.CreateCall(StreamingVectorLength); llvm::Value *Mulvl = - Builder.CreateMul(StreamingVectorLengthCall, Ops[5], "mulvl"); + Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl"); // The type of the ptr parameter is void *, so use Int8Ty here.
- BasePtr = Builder.CreateGEP(Int8Ty, Ops[4], Mulvl); + BasePtr = Builder.CreateGEP(Int8Ty, Ops[3], Mulvl); } NewOps.push_back(BasePtr); NewOps.push_back(Ops[0]); - NewOps.push_back(EmitTileslice(Ops[2], Ops[1])); + NewOps.push_back(Ops[1]); Function *F = CGM.getIntrinsic(IntID); return Builder.CreateCall(F, NewOps); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index c309bde627f7..57ed46995500 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -20,8 +20,8 @@ // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za8(0, slice_base, 0, pg, ptr); - svld1_hor_za8(0, slice_base, 15, pg, ptr); + svld1_hor_za8(0, slice_base, pg, ptr); + svld1_hor_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za16( @@ -34,8 +34,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, con // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za16(0, slice_base, 0, pg, ptr); - svld1_hor_za16(1, slice_base, 7, pg, ptr); + svld1_hor_za16(0, slice_base, pg, ptr); + svld1_hor_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za32( @@ -48,8 +48,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za32(0, slice_base, 0, pg, ptr); - svld1_hor_za32(3, slice_base, 3, pg, ptr); + svld1_hor_za32(0, slice_base, pg, ptr); + svld1_hor_za32(3, slice_base + 3, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za64( @@ -62,8 +62,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za64(0, slice_base, 0, pg, ptr); - svld1_hor_za64(7, slice_base, 1, pg, ptr); + svld1_hor_za64(0, slice_base, pg, ptr); + svld1_hor_za64(7, slice_base + 1, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za128( @@ -75,8 +75,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za128(0, slice_base, 0, pg, ptr); - svld1_hor_za128(15, slice_base, 0, pg, ptr); + svld1_hor_za128(0, slice_base, pg, ptr); + svld1_hor_za128(15, slice_base, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za8( @@ -88,8 +88,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, c // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za8(0, slice_base, 0, pg, ptr); - svld1_ver_za8(0, slice_base, 15, pg, ptr); + svld1_ver_za8(0, slice_base, pg, ptr); + svld1_ver_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za16( @@ -102,8 +102,8 @@ ARM_STREAMING_ATTR void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, con // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za16(0, slice_base, 0, pg, ptr); - svld1_ver_za16(1, slice_base, 7, pg, ptr); + svld1_ver_za16(0, slice_base, pg, ptr); + 
svld1_ver_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za32( @@ -116,8 +116,8 @@ ARM_STREAMING_ATTR void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za32(0, slice_base, 0, pg, ptr); - svld1_ver_za32(3, slice_base, 3, pg, ptr); + svld1_ver_za32(0, slice_base, pg, ptr); + svld1_ver_za32(3, slice_base + 3, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za64( @@ -130,8 +130,8 @@ ARM_STREAMING_ATTR void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za64(0, slice_base, 0, pg, ptr); - svld1_ver_za64(7, slice_base, 1, pg, ptr); + svld1_ver_za64(0, slice_base, pg, ptr); + svld1_ver_za64(7, slice_base + 1, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za128( @@ -143,6 +143,6 @@ ARM_STREAMING_ATTR void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za128(0, slice_base, 0, pg, ptr); - svld1_ver_za128(15, slice_base, 0, pg, ptr); + svld1_ver_za128(0, slice_base, pg, ptr); + svld1_ver_za128(15, slice_base, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 6c80ef55f818..5d61587d8557 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -23,8 +23,8 @@ // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum); + svld1_hor_vnum_za8(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za16( @@ -40,8 +40,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum); + svld1_hor_vnum_za16(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za32( @@ -57,8 +57,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum); + svld1_hor_vnum_za32(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za64( @@ -74,8 +74,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum); + svld1_hor_vnum_za64(0, slice_base, pg, ptr, 
vnum); + svld1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za128( @@ -90,8 +90,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum); + svld1_hor_vnum_za128(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za128(15, slice_base, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_hor_za8( @@ -106,8 +106,8 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum); + svld1_ver_vnum_za8(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za16( @@ -123,8 +123,8 @@ ARM_STREAMING_ATTR void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum); + svld1_ver_vnum_za16(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za32( @@ -140,8 +140,8 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum); + svld1_ver_vnum_za32(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za64( @@ -157,8 +157,8 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum); + svld1_ver_vnum_za64(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za128( @@ -173,6 +173,6 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum); + svld1_ver_vnum_za128(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za128(15, slice_base, pg, ptr, vnum); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index 067745f7d4a0..eec542341670 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -20,8 +20,8 @@ // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_za8(uint32_t slice_base, 
svbool_t pg, void *ptr) { - svst1_hor_za8(0, slice_base, 0, pg, ptr); - svst1_hor_za8(0, slice_base, 15, pg, ptr); + svst1_hor_za8(0, slice_base, pg, ptr); + svst1_hor_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za16( @@ -34,8 +34,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, voi // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za16(0, slice_base, 0, pg, ptr); - svst1_hor_za16(1, slice_base, 7, pg, ptr); + svst1_hor_za16(0, slice_base, pg, ptr); + svst1_hor_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za32( @@ -48,8 +48,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za32(0, slice_base, 0, pg, ptr); - svst1_hor_za32(3, slice_base, 3, pg, ptr); + svst1_hor_za32(0, slice_base, pg, ptr); + svst1_hor_za32(3, slice_base + 3, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za64( @@ -62,8 +62,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za64(0, slice_base, 0, pg, ptr); - svst1_hor_za64(7, slice_base, 1, pg, ptr); + svst1_hor_za64(0, slice_base, pg, ptr); + svst1_hor_za64(7, slice_base + 1, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za128( @@ -75,8 +75,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za128(0, slice_base, 0, pg, ptr); - svst1_hor_za128(15, slice_base, 0, pg, ptr); + svst1_hor_za128(0, slice_base, pg, ptr); + svst1_hor_za128(15, slice_base, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za8( @@ -88,8 +88,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, v // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za8(0, slice_base, 0, pg, ptr); - svst1_ver_za8(0, slice_base, 15, pg, ptr); + svst1_ver_za8(0, slice_base, pg, ptr); + svst1_ver_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za16( @@ -102,8 +102,8 @@ ARM_STREAMING_ATTR void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, voi // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za16(0, slice_base, 0, pg, ptr); - svst1_ver_za16(1, slice_base, 7, pg, ptr); + svst1_ver_za16(0, slice_base, pg, ptr); + svst1_ver_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za32( @@ -116,8 +116,8 @@ ARM_STREAMING_ATTR void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za32(0, slice_base, 0, pg, ptr); - svst1_ver_za32(3, slice_base, 3, pg, ptr); + svst1_ver_za32(0, slice_base, pg, ptr); + svst1_ver_za32(3, slice_base + 3, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za64( @@ -130,8 +130,8 @@ ARM_STREAMING_ATTR void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za64(0, 
slice_base, 0, pg, ptr); - svst1_ver_za64(7, slice_base, 1, pg, ptr); + svst1_ver_za64(0, slice_base, pg, ptr); + svst1_ver_za64(7, slice_base + 1, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za128( @@ -143,6 +143,6 @@ ARM_STREAMING_ATTR void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za128(0, slice_base, 0, pg, ptr); - svst1_ver_za128(15, slice_base, 0, pg, ptr); + svst1_ver_za128(0, slice_base, pg, ptr); + svst1_ver_za128(15, slice_base, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c index 4af93ac38dcc..81a2bba953b8 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -23,8 +23,8 @@ // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum); - svst1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum); + svst1_hor_vnum_za8(0, slice_base, pg, ptr, vnum); + svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_hor_vnum_za16( @@ -40,8 +40,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum); - svst1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum); + svst1_hor_vnum_za16(0, slice_base, pg, ptr, vnum); + svst1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_hor_vnum_za32( @@ -57,8 +57,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum); - svst1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum); + svst1_hor_vnum_za32(0, slice_base, pg, ptr, vnum); + svst1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_hor_vnum_za64( @@ -74,8 +74,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum); - svst1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum); + svst1_hor_vnum_za64(0, slice_base, pg, ptr, vnum); + svst1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_hor_vnum_za128( @@ -90,8 +90,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum); - svst1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum); + svst1_hor_vnum_za128(0, slice_base, pg, ptr, vnum); + svst1_hor_vnum_za128(15, slice_base, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_ver_vnum_za8( @@ -106,8 +106,8 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_vnum_za8(uint32_t 
slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum); - svst1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum); + svst1_ver_vnum_za8(0, slice_base, pg, ptr, vnum); + svst1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_ver_vnum_za16( @@ -123,8 +123,8 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum); - svst1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum); + svst1_ver_vnum_za16(0, slice_base, pg, ptr, vnum); + svst1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_ver_vnum_za32( @@ -140,8 +140,8 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum); - svst1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum); + svst1_ver_vnum_za32(0, slice_base, pg, ptr, vnum); + svst1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_ver_vnum_za64( @@ -157,8 +157,8 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum); - svst1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum); + svst1_ver_vnum_za64(0, slice_base, pg, ptr, vnum); + svst1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svst1_ver_vnum_za128( @@ -173,6 +173,6 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: ret void // ARM_STREAMING_ATTR void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { - svst1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum); - svst1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum); + svst1_ver_vnum_za128(0, slice_base, pg, ptr, vnum); + svst1_ver_vnum_za128(15, slice_base, pg, ptr, vnum); } diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index 3316c28cebdb..9b88d463d5e2 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -14,21 +14,13 @@ void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, -1, 0, pg, ptr); + SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_za8,,,)(1, -1, 15, pg, ptr); + SVE_ACLE_FUNC(svst1_ver_za8,,,)(1, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svld1_hor_za128,,,)(0, -1, -1, pg, ptr); + SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_za128,,,)(15, -1, 1, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - 
SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(-1, -1, 0, pg, ptr, 1); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za8,,,)(1, -1, 15, pg, ptr, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za128,,,)(0, -1, -1, pg, ptr, 1); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(15, -1, 1, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za8,,,)(1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, -1, slice); @@ -42,21 +34,13 @@ void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) { void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, -1, 0, pg, ptr); - // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_za16,,,)(2, -1, 7, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_za64,,,)(0, -1, -1, pg, ptr); - // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_za64,,,)(7, -1, 2, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za16,,,)(-1, -1, 0, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za16,,,)(2, -1, 7, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_za16,,,)(2, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za64,,,)(0, -1, -1, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_vnum_za16,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(7, -1, 2, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za16,,,)(2, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svread_hor_za16, _s16, _m,)(svundef_s16(), pg, -1, slice); @@ -70,21 +54,13 @@ void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) { void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, -1, 0, pg, ptr); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_za32,,,)(4, -1, 3, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_za32,,,)(0, -1, -1, pg, ptr); + SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_za32,,,)(3, -1, 4, pg, ptr); + SVE_ACLE_FUNC(svst1_ver_za32,,,)(4, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za32,,,)(-1, -1, 0, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_vnum_za32,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 4 is outside the 
valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za32,,,)(4, -1, 3, pg, ptr, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za32,,,)(0, -1, -1, pg, ptr, 1); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za32,,,)(3, -1, 4, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za32,,,)(4, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svread_hor_za32, _s32, _m,)(svundef_s32(), pg, -1, slice); @@ -116,21 +92,13 @@ void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) { void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, -1, 0, pg, ptr); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_za64,,,)(8, -1, 1, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_za16,,,)(0, -1, -1, pg, ptr); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_za16,,,)(1, -1, 8, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za64,,,)(-1, -1, 0, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(8, -1, 1, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_za64,,,)(8, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za16,,,)(0, -1, -1, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_vnum_za64,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za16,,,)(1, -1, 8, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(8, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svread_hor_za64, _s64, _m,)(svundef_s64(), pg, -1, slice); @@ -162,21 +130,13 @@ void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) { void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, -1, 0, pg, ptr); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_za128,,,)(16, -1, 0, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_za8,,,)(0, -1, -1, pg, ptr); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_za8,,,)(0, -1, 16, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za128,,,)(-1, -1, 0, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(16, -1, 0, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_za128,,,)(16, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the 
valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(0, -1, -1, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_vnum_za128,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za8,,,)(0, -1, 16, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(16, slice, pg, ptr, 1); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} SVE_ACLE_FUNC(svldr_vnum_za,,,)(-1, ptr, 16); @@ -201,14 +161,10 @@ void test_range_0_255(svbool_t pg, void *ptr) { } void test_constant(uint64_t u64, svbool_t pg, void *ptr) { - SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, 0, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} - SVE_ACLE_FUNC(svld1_ver_za16,,,)(0, u64, u64, pg, ptr); // expected-error {{argument to 'svld1_ver_za16' must be a constant integer}} - SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}} - SVE_ACLE_FUNC(svst1_ver_za64,,,)(0, u64, u64, pg, ptr); // expected-error {{argument to 'svst1_ver_za64' must be a constant integer}} - SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} - SVE_ACLE_FUNC(svld1_ver_vnum_za16,,,)(0, u64, u64, pg, ptr, u64); // expected-error {{argument to 'svld1_ver_vnum_za16' must be a constant integer}} - SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}} - SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(0, u64, u64, pg, ptr, u64); // expected-error {{argument to 'svst1_ver_vnum_za64' must be a constant integer}} + SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} + SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}} + SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} + SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}} SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}} SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c index b384244ac6c6..2de6d9f6877f 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c @@ -7,19 +7,19 @@ __attribute__((target("sme"))) void test_sme(svbool_t pg, void *ptr) { - svld1_hor_za8(0, 0, 0, pg, ptr); + svld1_hor_za8(0, 0, pg, ptr); } __attribute__((target("arch=armv8-a+sme"))) void test_arch_sme(svbool_t pg, void *ptr) { - svld1_hor_vnum_za32(0, 0, 0, pg, ptr, 0); + svld1_hor_vnum_za32(0, 0, pg, ptr, 0); } __attribute__((target("+sme"))) void test_plus_sme(svbool_t pg, void *ptr) { - svst1_ver_za16(0, 0, 0, pg, ptr); + svst1_ver_za16(0, 0, pg, ptr); } void undefined(svbool_t pg, void *ptr) { - svst1_ver_vnum_za64(0, 0, 0, pg, ptr, 0); // expected-error {{'svst1_ver_vnum_za64' needs target feature sme}} + svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-error 
{{'svst1_ver_vnum_za64' needs target feature sme}} } -- Gitee From 97116000e0386bd129b0304683c8330497c95da8 Mon Sep 17 00:00:00 2001 From: Sander de Smalen <sander.desmalen@arm.com> Date: Mon, 7 Aug 2023 10:43:38 +0000 Subject: [PATCH 04/77] [Clang] Make __arm_streaming apply only to prototyped functions. Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D152141 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/Attr.td | 3 +- clang/include/clang/Basic/AttrDocs.td | 4 +- clang/test/Parser/c2x-attribute-keywords.c | 26 +++---- clang/test/Parser/c2x-attribute-keywords.m | 2 +- .../test/Parser/cxx0x-keyword-attributes.cpp | 74 +++++++++---------- 5 files changed, 56 insertions(+), 53 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index d5204b286966..1cd5d8a1f552 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2435,7 +2435,8 @@ def AArch64SVEPcs: DeclOrTypeAttr { def ArmStreaming : TypeAttr, TargetSpecificAttr<TargetAArch64> { let Spellings = [RegularKeyword<"__arm_streaming">]; - let Documentation = [ArmStreamingDocs]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeStreamingDocs]; } def Pure : InheritableAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 2c950231255d..50efb1d0ba76 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6578,7 +6578,7 @@ Requirements on Development Tools - Engineering Specification Documentation }]; } -def ArmStreamingDocs : Documentation { +def ArmSmeStreamingDocs : Documentation { let Category = DocCatType; let Content = [{ .. Note:: This attribute has not been implemented yet, but once it is @@ -6595,6 +6595,8 @@ It applies to function types and specifies that the function has a * the function must return in streaming mode +* the function does not have a K&R-style unprototyped function type. + See `Procedure Call Standard for the Arm® 64-bit Architecture (AArch64) <https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst>`_ for more details about streaming-interface functions.
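To make the new HasFunctionProto restriction concrete, here is a minimal C sketch (an editorial illustration, not part of the patch; the function names are invented). With this change, __arm_streaming may only attach to a function type that carries a prototype:

void ok(void) __arm_streaming;    // accepted: prototyped function type
void ok2(int x) __arm_streaming;  // accepted: prototyped function type
void knr() __arm_streaming;       // rejected in C17 and earlier, where () declares a K&R-style unprototyped function

In C23 and in C++ an empty parameter list is itself a prototype, so the last declaration is only diagnosed in older C modes, which is why the parser tests below exercise both the notc2x and c2x configurations.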
diff --git a/clang/test/Parser/c2x-attribute-keywords.c b/clang/test/Parser/c2x-attribute-keywords.c index 757dc8286011..d8291b710e6d 100644 --- a/clang/test/Parser/c2x-attribute-keywords.c +++ b/clang/test/Parser/c2x-attribute-keywords.c @@ -1,13 +1,13 @@ // RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %s // RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %s -enum __arm_streaming E { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - One __arm_streaming, // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum __arm_streaming E { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} + One __arm_streaming, // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} Two, - Three __arm_streaming // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + Three __arm_streaming // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} }; -enum __arm_streaming { Four }; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum __arm_streaming { Four }; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming enum E2 { Five }; // expected-error {{misplaced '__arm_streaming'}} // FIXME: this diagnostic can be improved. @@ -16,7 +16,7 @@ enum { __arm_streaming Six }; // expected-error {{expected identifier}} // FIXME: this diagnostic can be improved. enum E3 __arm_streaming { Seven }; // expected-error {{expected identifier or '('}} -struct __arm_streaming S1 { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +struct __arm_streaming S1 { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} int i __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} int __arm_streaming j; // expected-error {{'__arm_streaming' only applies to function types}} int k[10] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} @@ -32,20 +32,20 @@ struct __arm_streaming S1 { // expected-error {{'__arm_streaming' cannot be appl __arm_streaming struct S2 { int a; }; // expected-error {{misplaced '__arm_streaming'}} struct S3 __arm_streaming { int a; }; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -union __arm_streaming U { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +union __arm_streaming U { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} double d __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types; type here is 'double'}} __arm_streaming int i; // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}} }; __arm_streaming union U2 { double d; }; // expected-error {{misplaced '__arm_streaming'}} union U3 __arm_streaming { double d; }; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -struct __arm_streaming IncompleteStruct; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -union __arm_streaming 
IncompleteUnion; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming IncompleteEnum; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +struct __arm_streaming IncompleteStruct; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +union __arm_streaming IncompleteUnion; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +enum __arm_streaming IncompleteEnum; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming void f1(void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} void __arm_streaming f2(void); // expected-error {{'__arm_streaming' only applies to function types}} @@ -95,7 +95,7 @@ void f11(void) { } goto foo; - __arm_streaming foo: (void)1; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + __arm_streaming foo: (void)1; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming for (;;); // expected-error {{'__arm_streaming' cannot be applied to a statement}} __arm_streaming while (1); // expected-error {{'__arm_streaming' cannot be applied to a statement}} @@ -106,7 +106,7 @@ void f11(void) { __arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a statement}} (void)sizeof(int [4]__arm_streaming); // expected-error {{'__arm_streaming' only applies to function types}} - (void)sizeof(struct __arm_streaming S3 { int a __arm_streaming; }); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} \ + (void)sizeof(struct __arm_streaming S3 { int a __arm_streaming; }); // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} \ // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}} __arm_streaming return; // expected-error {{'__arm_streaming' cannot be applied to a statement}} diff --git a/clang/test/Parser/c2x-attribute-keywords.m b/clang/test/Parser/c2x-attribute-keywords.m index d1c45da34fbc..2296be13cb71 100644 --- a/clang/test/Parser/c2x-attribute-keywords.m +++ b/clang/test/Parser/c2x-attribute-keywords.m @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify %s -enum __arm_streaming E1 : int; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum __arm_streaming E1 : int; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} @interface Base @end diff --git a/clang/test/Parser/cxx0x-keyword-attributes.cpp b/clang/test/Parser/cxx0x-keyword-attributes.cpp index 256a834e9e54..8d31efac5320 100644 --- a/clang/test/Parser/cxx0x-keyword-attributes.cpp +++ b/clang/test/Parser/cxx0x-keyword-attributes.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu %s +// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme %s // Need std::initializer_list namespace std { @@ -48,10 +48,10 @@ void noexcept_fn_attr () noexcept __arm_streaming; struct MemberFnOrder { virtual void f() const volatile && noexcept __arm_streaming final = 0; }; -struct __arm_streaming struct_attr; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -class __arm_streaming 
class_attr {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -union __arm_streaming union_attr; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming E { }; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +struct __arm_streaming struct_attr; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +class __arm_streaming class_attr {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +union __arm_streaming union_attr; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +enum __arm_streaming E { }; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} namespace test_misplacement { __arm_streaming struct struct_attr2; // expected-error {{misplaced '__arm_streaming'}} __arm_streaming class class_attr2; // expected-error {{misplaced '__arm_streaming'}} @@ -60,28 +60,28 @@ __arm_streaming enum E2 { }; // expected-error {{misplaced '__arm_streaming'}} } // Checks attributes placed at wrong syntactic locations of class specifiers. -class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} +class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} attr_after_class_name_decl __arm_streaming __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} -class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} +class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} attr_after_class_name_definition __arm_streaming __arm_streaming __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 3 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 3 {{'__arm_streaming' only applies to non-K&R-style functions}} -class __arm_streaming c {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +class __arm_streaming c {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} class c __arm_streaming __arm_streaming x; // expected-error 2 {{'__arm_streaming' only applies to function types}} class c __arm_streaming __arm_streaming y __arm_streaming __arm_streaming; // expected-error 4 {{'__arm_streaming' only applies to function types}} class c final [(int){0}]; class base {}; -class __arm_streaming __arm_streaming final_class // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} +class __arm_streaming __arm_streaming final_class // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming alignas(float) final // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming alignas(float) __arm_streaming alignas(float): base{}; // expected-error {{'__arm_streaming' cannot appear here}} -class __arm_streaming __arm_streaming final_class_another // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} +class __arm_streaming __arm_streaming final_class_another 
// expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming __arm_streaming alignas(16) final // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming __arm_streaming alignas(16) __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}} class after_class_close {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "class" to apply it to the type declaration}} @@ -95,7 +95,7 @@ void fn_with_structs() { __arm_streaming struct with_init_declarators {} init_declarator; // expected-error {{'__arm_streaming' only applies to function types}} __arm_streaming struct no_init_declarators; // expected-error {{'__arm_streaming' cannot appear here}} } -__arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +__arm_streaming; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} struct ctordtor { __arm_streaming ctordtor __arm_streaming () __arm_streaming; // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} ctordtor (C) __arm_streaming; @@ -122,16 +122,16 @@ __arm_streaming static_assert(true, ""); //expected-error {{'__arm_streaming' ca __arm_streaming asm(""); // expected-error {{'__arm_streaming' cannot appear here}} __arm_streaming using ns::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming using namespace ns; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +__arm_streaming using namespace ns; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} namespace __arm_streaming ns2 {} // expected-warning {{attributes on a namespace declaration are a C++17 extension}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} using __arm_streaming alignas(4)__arm_streaming ns::i; // expected-warning 2 {{ISO C++}} \ expected-error {{'__arm_streaming' cannot appear here}} \ expected-error {{'alignas' attribute only applies to variables, data members and tag types}} \ expected-warning {{ISO C++}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} using __arm_streaming alignas(4) __arm_streaming foobar = int; // expected-error {{'__arm_streaming' cannot appear here}} \ expected-error {{'alignas' attribute only applies to}} \ expected-error 2 {{'__arm_streaming' only applies to function types}} @@ -140,25 +140,25 @@ __arm_streaming using T = int; // expected-error {{'__arm_streaming' cannot appe using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}} template using U __arm_streaming = T; // expected-error {{'__arm_streaming' only applies to function types}} using ns::i __arm_streaming; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} using ns::i __arm_streaming, ns::i __arm_streaming; // expected-warning 2 {{ISO C++}} \ expected-warning {{use of multiple declarators in a 
single using declaration is a C++17 extension}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} struct using_in_struct_base { typedef int i, j, k, l; }; struct using_in_struct : using_in_struct_base { __arm_streaming using using_in_struct_base::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} using using_in_struct_base::j __arm_streaming; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming using using_in_struct_base::k __arm_streaming, using_in_struct_base::l __arm_streaming; // expected-warning 3 {{ISO C++}} \ expected-warning {{use of multiple declarators in a single using declaration is a C++17 extension}} \ - expected-error 4 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 4 {{'__arm_streaming' only applies to non-K&R-style functions}} }; using __arm_streaming ns::i; // expected-warning {{ISO C++}} \ expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}} auto trailing() -> __arm_streaming const int; // expected-error {{'__arm_streaming' cannot appear here}} @@ -177,20 +177,20 @@ struct __arm_streaming Template t; // expected-error {{'__arm_streaming' ca struct __arm_streaming ::template Template u; // expected-error {{'__arm_streaming' cannot appear here}} template struct __arm_streaming Template; // expected-error {{'__arm_streaming' cannot appear here}} template struct __attribute__((pure)) Template; // We still allow GNU-style attributes here -template <> struct __arm_streaming Template; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +template <> struct __arm_streaming Template; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -enum __arm_streaming E1 {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum __arm_streaming E1 {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} enum __arm_streaming E2; // expected-error {{forbids forward references}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming E1; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming E3 : int; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +enum __arm_streaming E1; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +enum __arm_streaming E3 : int; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +enum __arm_streaming { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} k_123 __arm_streaming = 123 // expected-warning {{attributes on an enumerator declaration are a C++17 extension}} \ - expected-error {{'__arm_streaming' cannot be applied to a 
declaration}} + expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} }; enum __arm_streaming E1 e; // expected-error {{'__arm_streaming' cannot appear here}} enum __arm_streaming class E4 { }; // expected-error {{'__arm_streaming' cannot appear here}} -enum struct __arm_streaming E5; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum struct __arm_streaming E5; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} enum E6 {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "enum" to apply it to the type declaration}} struct S { @@ -229,7 +229,7 @@ void foo () { } __arm_streaming goto there; // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming there: // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + __arm_streaming there: // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} __arm_streaming try { // expected-error {{'__arm_streaming' cannot be applied to a statement}} } __arm_streaming catch (...) { // expected-error {{'__arm_streaming' cannot appear here}} @@ -277,7 +277,7 @@ void baz () { enum class __attribute__((visibility("hidden"))) SecretKeepers { one, /* rest are deprecated */ two, three }; -enum class __arm_streaming EvenMoreSecrets {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum class __arm_streaming EvenMoreSecrets {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} // Forbid attributes on decl specifiers. unsigned __arm_streaming static int __arm_streaming v1; // expected-error {{'__arm_streaming' only applies to function types}} \ @@ -286,7 +286,7 @@ typedef __arm_streaming unsigned long __arm_streaming v2; // expected-error {{'_ expected-error {{'__arm_streaming' cannot appear here}} int __arm_streaming foo(int __arm_streaming x); // expected-error 2 {{'__arm_streaming' only applies to function types}} -__arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +__arm_streaming; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} class A { A(__arm_streaming int a); // expected-error {{'__arm_streaming' only applies to function types}} @@ -341,5 +341,5 @@ struct F : virtual __arm_streaming public A {}; // expected-error {{'__arm_strea expected-error {{'__arm_streaming' cannot be applied to a base specifier}} } -namespace __arm_streaming ns_attr {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} \ +namespace __arm_streaming ns_attr {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} \ expected-warning {{attributes on a namespace declaration are a C++17 extension}} -- Gitee From 41d6c9a0641da93d5d02dd34fae95a006438c67c Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 9 Aug 2023 09:59:46 +0000 Subject: [PATCH 05/77] [Clang][AArch64] Diagnostics for SME attributes when target doesn't have 'sme' This patch adds error diagnostics to Clang when code uses the AArch64 SME attributes without specifying 'sme' as available target attribute. * Function definitions marked as '__arm_streaming', '__arm_locally_streaming', '__arm_shared_za' or '__arm_new_za' will by definition use or require SME instructions. * Calls from non-streaming functions to streaming-functions require the compiler to enable/disable streaming-SVE mode around the call-site. 
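As a quick illustration of the new behaviour (an editorial sketch, not part of the commit message; the error wording is taken verbatim from this patch's DiagnosticSemaKinds.td additions), compiling the following without the 'sme' target feature now fails:

void decl(void) __arm_streaming;      // OK: a bare declaration requires no SME code generation
void sm_def(void) __arm_streaming {}  // error: function executed in streaming-SVE mode requires 'sme'
void za_def(void) __arm_shared_za {}  // error: function using ZA state requires 'sme'
void call(void (*sfn)(void) __arm_streaming) { sfn(); } // error: call to a streaming function requires 'sme'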
In some cases we can accept the SME attributes without having 'sme'
enabled:
* Function declarations can have the SME attributes.
* Definitions can be __arm_streaming_compatible since the generated code
  should execute on processing elements without SME.

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D157269

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  8 ++
 clang/lib/Sema/SemaChecking.cpp               | 18 ++++-
 clang/lib/Sema/SemaDecl.cpp                   | 27 +++++++
 ...-sme-func-attrs-without-target-feature.cpp | 48 ++++++++++++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 77 ++++++------------
 .../Target/AArch64/AArch64TargetMachine.cpp   |  2 -
 llvm/lib/Target/AArch64/SMEInstrFormats.td    | 36 +++++++++
 ...compatible-to-normal-fn-wihout-sme-attr.ll | 41 ++++++++++
 .../test/MC/AArch64/SME/directives-negative.s |  8 +-
 llvm/test/MC/AArch64/SME/smstart.s            | 16 +---
 llvm/test/MC/AArch64/SME/smstop.s             | 16 +---
 llvm/test/MC/AArch64/SME/system-regs.s        | 12 +--
 12 files changed, 217 insertions(+), 92 deletions(-)
 create mode 100644 clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
 create mode 100644 llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 0e97620945af..2c10ce943229 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3621,6 +3621,14 @@ def err_attribute_vecreturn_only_vector_member : Error<
   "the vecreturn attribute can only be used on a class or structure with one member, which must be a vector">;
 def err_attribute_vecreturn_only_pod_record : Error<
   "the vecreturn attribute can only be used on a POD (plain old data) class or structure (i.e. no virtual functions)">;
+def err_sme_attr_mismatch : Error<
+  "function declared %0 was previously declared %1, which has different SME function attributes">;
+def err_sme_call_in_non_sme_target : Error<
+  "call to a streaming function requires 'sme'">;
+def err_sme_definition_using_sm_in_non_sme_target : Error<
+  "function executed in streaming-SVE mode requires 'sme'">;
+def err_sme_definition_using_za_in_non_sme_target : Error<
+  "function using ZA state requires 'sme'">;
 def err_cconv_change : Error<
   "function declared '%0' here was previously declared "
   "%select{'%2'|without calling convention}1">;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 5ee20554c4cf..0af822a3d709 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -7053,8 +7053,8 @@ void Sema::CheckArgAlignment(SourceLocation Loc, NamedDecl *FDecl,
 }
 
 /// Handles the checks for format strings, non-POD arguments to vararg
-/// functions, NULL arguments passed to non-NULL parameters, and diagnose_if
-/// attributes.
+/// functions, NULL arguments passed to non-NULL parameters, diagnose_if
+/// attributes, and AArch64 SME attributes.
 void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
                      const Expr *ThisArg, ArrayRef<const Expr *> Args,
                      bool IsMemberFunction, SourceLocation Loc,
@@ -7135,6 +7135,20 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
                           ArgTy, ParamTy);
       }
     }
+
+  // If the callee has an AArch64 SME attribute to indicate that it is an
+  // __arm_streaming function, then the caller requires SME to be available.
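+  // (When the call site is not inside a function definition, e.g. a global
+  // initializer, the check below falls back to the target's global feature
+  // set instead of a caller's feature map.)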
+ FunctionProtoType::ExtProtoInfo ExtInfo = Proto->getExtProtoInfo(); + if (ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask) { + if (auto *CallerFD = dyn_cast(CurContext)) { + llvm::StringMap CallerFeatureMap; + Context.getFunctionFeatureMap(CallerFeatureMap, CallerFD); + if (!CallerFeatureMap.contains("sme")) + Diag(Loc, diag::err_sme_call_in_non_sme_target); + } else if (!Context.getTargetInfo().hasFeature("sme")) { + Diag(Loc, diag::err_sme_call_in_non_sme_target); + } + } } if (FDecl && FDecl->hasAttr()) { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5481bbd22c66..d0b86ee97544 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12081,6 +12081,33 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, if (!Redeclaration && LangOpts.CUDA) checkCUDATargetOverload(NewFD, Previous); } + + // Check if the function definition uses any AArch64 SME features without + // having the '+sme' feature enabled. + if (DeclIsDefn) { + bool UsesSM = NewFD->hasAttr(); + bool UsesZA = NewFD->hasAttr(); + if (const auto *FPT = NewFD->getType()->getAs()) { + FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); + UsesSM |= + EPI.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask; + UsesZA |= EPI.AArch64SMEAttributes & FunctionType::SME_PStateZASharedMask; + } + + if (UsesSM || UsesZA) { + llvm::StringMap FeatureMap; + Context.getFunctionFeatureMap(FeatureMap, NewFD); + if (!FeatureMap.contains("sme")) { + if (UsesSM) + Diag(NewFD->getLocation(), + diag::err_sme_definition_using_sm_in_non_sme_target); + else + Diag(NewFD->getLocation(), + diag::err_sme_definition_using_za_in_non_sme_target); + } + } + } + return Redeclaration; } diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp new file mode 100644 index 000000000000..b59d67f7f57b --- /dev/null +++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp @@ -0,0 +1,48 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -fsyntax-only -verify %s + +// This test is testing the diagnostics that Clang emits when compiling without '+sme'. + +void streaming_compatible_def() __arm_streaming_compatible {} // OK +void streaming_def() __arm_streaming { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} +void shared_za_def() __arm_shared_za { } // expected-error {{function using ZA state requires 'sme'}} +__arm_new_za void new_za_def() { } // expected-error {{function using ZA state requires 'sme'}} +__arm_locally_streaming void locally_streaming_def() { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} +void streaming_shared_za_def() __arm_streaming __arm_shared_za { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} + +// It should work fine when we explicitly add the target("sme") attribute. +__attribute__((target("sme"))) void streaming_compatible_def_sme_attr() __arm_streaming_compatible {} // OK +__attribute__((target("sme"))) void streaming_def_sme_attr() __arm_streaming { } // OK +__attribute__((target("sme"))) void shared_za_def_sme_attr() __arm_shared_za { } // OK +__arm_new_za __attribute__((target("sme"))) void new_za_def_sme_attr() {} // OK +__arm_locally_streaming __attribute__((target("sme"))) void locally_streaming_def_sme_attr() {} // OK + +// Test that it also works with the target("sme2") attribute. 
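+// ('sme2' implies 'sme' in the feature dependencies, so the same check is
+// satisfied.)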
+__attribute__((target("sme2"))) void streaming_def_sme2_attr() __arm_streaming { } // OK + +// No code is generated for declarations, so it should be fine to declare using the attribute. +void streaming_compatible_decl() __arm_streaming_compatible; // OK +void streaming_decl() __arm_streaming; // OK +void shared_za_decl() __arm_shared_za; // OK + +void non_streaming_decl(); +void non_streaming_def(void (*streaming_fn_ptr)(void) __arm_streaming, + void (*streaming_compatible_fn_ptr)(void) __arm_streaming_compatible) { + streaming_compatible_decl(); // OK + streaming_compatible_fn_ptr(); // OK + streaming_decl(); // expected-error {{call to a streaming function requires 'sme'}} + streaming_fn_ptr(); // expected-error {{call to a streaming function requires 'sme'}} +} + +void streaming_compatible_def2(void (*streaming_fn_ptr)(void) __arm_streaming, + void (*streaming_compatible_fn_ptr)(void) __arm_streaming_compatible) + __arm_streaming_compatible { + non_streaming_decl(); // OK + streaming_compatible_decl(); // OK + streaming_compatible_fn_ptr(); // OK + streaming_decl(); // expected-error {{call to a streaming function requires 'sme'}} + streaming_fn_ptr(); // expected-error {{call to a streaming function requires 'sme'}} +} + +// Also test when call-site is not a function. +int streaming_decl_ret_int() __arm_streaming; +int x = streaming_decl_ret_int(); // expected-error {{call to a streaming function requires 'sme'}} diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index cabfe9def7c2..352f34cab132 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -134,52 +134,6 @@ defm ZERO_M : sme_zero<"zero">; // Mode selection and state access instructions //===----------------------------------------------------------------------===// -// SME defines three pstate fields to set or clear PSTATE.SM, PSTATE.ZA, or -// both fields: -// -// MSR SVCRSM, # -// MSR SVCRZA, # -// MSR SVCRSMZA, # -// -// It's tricky to using the existing pstate operand defined in -// AArch64SystemOperands.td since it only encodes 5 bits including op1;op2, -// when these fields are also encoded in CRm[3:1]. -def MSRpstatesvcrImm1 - : PstateWriteSimple<(ins svcr_op:$pstatefield, timm0_1:$imm), "msr", - "\t$pstatefield, $imm">, - Sched<[WriteSys]> { - bits<3> pstatefield; - bit imm; - let Inst{18-16} = 0b011; // op1 - let Inst{11-9} = pstatefield; - let Inst{8} = imm; - let Inst{7-5} = 0b011; // op2 -} - -def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; -def : InstAlias<"smstart sm", (MSRpstatesvcrImm1 0b001, 0b1)>; -def : InstAlias<"smstart za", (MSRpstatesvcrImm1 0b010, 0b1)>; - -def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>; -def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; -def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; - - -// Pseudo to match to smstart/smstop. This expands: -// -// pseudonode (pstate_za|pstate_sm), before_call, expected_value -// -// Into: -// -// if (before_call != expected_value) -// node (pstate_za|pstate_sm) -// -// where node can be either 'smstart' or 'smstop'. -def MSRpstatePseudo : - Pseudo<(outs), - (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>, - Sched<[WriteSys]>; - // Pseudo to conditionally restore ZA state. 
This expands:
 //
 //   pseudonode tpidr2_el0, tpidr2obj, restore_routine
@@ -226,12 +180,6 @@ def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 0), (i64 1)), // before
 def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 0), (i64 1)), // after call
           (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
-
-// The generic case which gets expanded to a pseudo node.
-def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
-          (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
-def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
-          (MSRpstatePseudo svcr_op:$pstate, 0b0, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
-
 // Read and write TPIDR2_EL0
 def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
           (MSR 0xde85, GPR64:$val)>;
@@ -243,6 +191,31 @@ def : Pat<(i64 (AArch64ObscureCopy (i64 GPR64:$idx))),
           (OBSCURE_COPY GPR64:$idx)>;
 } // End let Predicates = [HasSME]
 
+// Pseudo to match to smstart/smstop. This expands:
+//
+//   pseudonode (pstate_za|pstate_sm), before_call, expected_value
+//
+// Into:
+//
+//   if (before_call != expected_value)
+//     node (pstate_za|pstate_sm)
+//
+// where node can be either 'smstart' or 'smstop'.
+//
+// This pseudo and corresponding patterns don't need to be predicated by SME,
+// because when they're emitted for streaming-compatible functions and run
+// in a non-SME context the generated code-paths will never execute any
+// SME instructions.
+def MSRpstatePseudo :
+  Pseudo<(outs),
+         (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
+  Sched<[WriteSys]>;
+
+def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
+          (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
+def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
+          (MSRpstatePseudo svcr_op:$pstate, 0b0, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
+
 //===----------------------------------------------------------------------===//
 // SME2 Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 559879139758..d2c12e6dfdc7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -446,8 +446,6 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
 
   assert((!StreamingSVEMode || I->hasSME()) &&
          "Expected SME to be available");
-  assert((!StreamingCompatibleSVEMode || I->hasSVEorSME()) &&
-         "Expected SVE or SME to be available");
 
   return I.get();
 }
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 6e3aadd5dd8c..b135fec03a8c 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -190,6 +190,42 @@ class SME_ZA_Tile_TwoPred_TwoVec_Pat(name # _PSEUDO) $tile, $Pn, $Pm, $Zn, $Zm)>;
+
+//===----------------------------------------------------------------------===//
+// SME smstart/smstop
+//===----------------------------------------------------------------------===//
+
+// SME defines three pstate fields to set or clear PSTATE.SM, PSTATE.ZA, or
+// both fields:
+//
+// MSR SVCRSM, #
+// MSR SVCRZA, #
+// MSR SVCRSMZA, #
+//
+// It's tricky to use the existing pstate operand defined in
+//
AArch64SystemOperands.td since it only encodes 5 bits including op1;op2, +// when these fields are also encoded in CRm[3:1]. +def MSRpstatesvcrImm1 + : PstateWriteSimple<(ins svcr_op:$pstatefield, timm0_1:$imm), "msr", + "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<3> pstatefield; + bit imm; + let Inst{18-16} = 0b011; // op1 + let Inst{11-9} = pstatefield; + let Inst{8} = imm; + let Inst{7-5} = 0b011; // op2 +} + +def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; +def : InstAlias<"smstart sm", (MSRpstatesvcrImm1 0b001, 0b1)>; +def : InstAlias<"smstart za", (MSRpstatesvcrImm1 0b010, 0b1)>; + +def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>; +def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; +def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; + + //===----------------------------------------------------------------------===// // SME Outer Products //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll new file mode 100644 index 000000000000..cffbadc53552 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s + +; Verify that the following code can be compiled without +sme, because if the +; call is not entered in streaming-SVE mode at runtime, the codepath leading +; to the smstop/smstart pair will not be executed either. + +target triple = "aarch64" + +define void @streaming_compatible() #0 { +; CHECK-LABEL: streaming_compatible: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: bl non_streaming +; CHECK-NEXT: tbz x19, #0, .LBB0_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @non_streaming() + ret void +} + +declare void @non_streaming() + +attributes #0 = { nounwind "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/MC/AArch64/SME/directives-negative.s b/llvm/test/MC/AArch64/SME/directives-negative.s index 3df90b301686..123c3a383d71 100644 --- a/llvm/test/MC/AArch64/SME/directives-negative.s +++ b/llvm/test/MC/AArch64/SME/directives-negative.s @@ -2,9 +2,9 @@ .arch_extension sme .arch_extension nosme -smstart +zero {za} // CHECK: error: instruction requires: sme -// CHECK-NEXT: smstart +// CHECK-NEXT: zero {za} .arch_extension sme-f64f64 .arch_extension nosme-f64f64 @@ -20,9 +20,9 @@ addha za0.d, p0/m, p0/m, z0.d .arch armv9-a+sme .arch armv9-a+nosme -smstart +zero {za} // CHECK: error: instruction requires: sme -// CHECK-NEXT: smstart +// CHECK-NEXT: zero {za} .arch armv9-a+sme-f64f64 .arch armv9-a+nosme-f64f64 diff --git a/llvm/test/MC/AArch64/SME/smstart.s b/llvm/test/MC/AArch64/SME/smstart.s index 1628a279bb35..07a696d28df8 100644 --- a/llvm/test/MC/AArch64/SME/smstart.s +++ b/llvm/test/MC/AArch64/SME/smstart.s @@ -1,38 +1,28 @@ // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST -// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \ // RUN: | llvm-objdump -d --mattr=+sme - | FileCheck %s --check-prefix=CHECK-INST // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \ -// RUN: | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-INST smstart // CHECK-INST: smstart // CHECK-ENCODING: [0x7f,0x47,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503477f msr S0_3_C4_C7_3, xzr smstart sm // CHECK-INST: smstart sm // CHECK-ENCODING: [0x7f,0x43,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503437f msr S0_3_C4_C3_3, xzr smstart za // CHECK-INST: smstart za // CHECK-ENCODING: [0x7f,0x45,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503457f msr S0_3_C4_C5_3, xzr smstart SM // CHECK-INST: smstart sm // CHECK-ENCODING: [0x7f,0x43,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503437f msr S0_3_C4_C3_3, xzr smstart ZA // CHECK-INST: smstart za // CHECK-ENCODING: [0x7f,0x45,0x03,0xd5] -// 
CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503457f msr S0_3_C4_C5_3, xzr diff --git a/llvm/test/MC/AArch64/SME/smstop.s b/llvm/test/MC/AArch64/SME/smstop.s index b7c21d939e9a..df9d1118d96f 100644 --- a/llvm/test/MC/AArch64/SME/smstop.s +++ b/llvm/test/MC/AArch64/SME/smstop.s @@ -1,38 +1,28 @@ // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST -// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \ // RUN: | llvm-objdump -d --mattr=+sme - | FileCheck %s --check-prefix=CHECK-INST // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \ -// RUN: | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// RUN: | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-INST smstop // CHECK-INST: smstop // CHECK-ENCODING: [0x7f,0x46,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503467f msr S0_3_C4_C6_3, xzr smstop sm // CHECK-INST: smstop sm // CHECK-ENCODING: [0x7f,0x42,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503427f msr S0_3_C4_C2_3, xzr smstop za // CHECK-INST: smstop za // CHECK-ENCODING: [0x7f,0x44,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503447f msr S0_3_C4_C4_3, xzr smstop SM // CHECK-INST: smstop sm // CHECK-ENCODING: [0x7f,0x42,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503427f msr S0_3_C4_C2_3, xzr smstop ZA // CHECK-INST: smstop za // CHECK-ENCODING: [0x7f,0x44,0x03,0xd5] -// CHECK-ERROR: instruction requires: sme -// CHECK-UNKNOWN: d503447f msr S0_3_C4_C4_3, xzr diff --git a/llvm/test/MC/AArch64/SME/system-regs.s b/llvm/test/MC/AArch64/SME/system-regs.s index 48033d026c44..093b64a70fb9 100644 --- a/llvm/test/MC/AArch64/SME/system-regs.s +++ b/llvm/test/MC/AArch64/SME/system-regs.s @@ -119,37 +119,37 @@ msr SVCRSM, #0 // CHECK-INST: smstop sm // CHECK-ENCODING: [0x7f,0x42,0x03,0xd5] // CHECK-ERROR: expected writable system register or pstate -// CHECK-UNKNOWN: d503427f msr S0_3_C4_C2_3, xzr +// CHECK-UNKNOWN: d503427f smstop sm msr SVCRSM, #1 // CHECK-INST: smstart // CHECK-ENCODING: [0x7f,0x43,0x03,0xd5] // CHECK-ERROR: expected writable system register or pstate -// CHECK-UNKNOWN: d503437f msr S0_3_C4_C3_3, xzr +// CHECK-UNKNOWN: d503437f smstart msr SVCRZA, #0 // CHECK-INST: smstop za // CHECK-ENCODING: [0x7f,0x44,0x03,0xd5] // CHECK-ERROR: expected writable system register or pstate -// CHECK-UNKNOWN: d503447f msr S0_3_C4_C4_3, xzr +// CHECK-UNKNOWN: d503447f smstop za msr SVCRZA, #1 // CHECK-INST: smstart za // CHECK-ENCODING: [0x7f,0x45,0x03,0xd5] // CHECK-ERROR: expected writable system register or pstate -// CHECK-UNKNOWN: d503457f msr S0_3_C4_C5_3, xzr +// CHECK-UNKNOWN: d503457f smstart za msr SVCRSMZA, #0 // CHECK-INST: smstop // CHECK-ENCODING: [0x7f,0x46,0x03,0xd5] // CHECK-ERROR: expected writable system register or pstate -// CHECK-UNKNOWN: d503467f msr S0_3_C4_C6_3, xzr +// CHECK-UNKNOWN: d503467f smstop msr SVCRSMZA, #1 // CHECK-INST: smstart // CHECK-ENCODING: [0x7f,0x47,0x03,0xd5] // CHECK-ERROR: expected writable system register or pstate -// CHECK-UNKNOWN: d503477f msr S0_3_C4_C7_3, xzr +// CHECK-UNKNOWN: d503477f smstart msr TPIDR2_EL0, x3 // CHECK-INST: msr 
TPIDR2_EL0, x3
--
Gitee

From e2dc00942534395d3b6658028b30c6da1af6a5d6 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Tue, 23 May 2023 16:42:56 +0100
Subject: [PATCH 06/77] [Clang][AArch64] Add/implement ACLE keywords for SME.

This patch adds all the language-level function keywords defined in:
https://github.com/ARM-software/acle/pull/188 (merged)
https://github.com/ARM-software/acle/pull/261 (update after D148700 landed)

The keywords are used to control PSTATE.ZA and PSTATE.SM, which are
respectively used for enabling the use of the ZA matrix array and
Streaming mode.

This information needs to be available on call sites, since the use of
ZA or streaming mode may have to be enabled or disabled around the
call-site (depending on the IR attributes set on the caller and the
callee). For calls to functions from a function pointer, there is no IR
declaration available, so the IR attributes must be added explicitly to
the call-site.

With the exception of '__arm_locally_streaming' and '__arm_new_za' the
information is part of the function's interface, not just the function
definition, and thus needs to be propagated through the
FunctionProtoType::ExtProtoInfo.

This patch adds the definitions of these keywords, as well as codegen
and semantic analysis to ensure conversions between function pointers
are valid and that no conflicting keywords are set. For example,
'__arm_streaming' and '__arm_streaming_compatible' are mutually
exclusive.

Differential Revision: https://reviews.llvm.org/D127762

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 clang/include/clang/AST/Type.h                |  53 ++-
 clang/include/clang/AST/TypeProperties.td     |   4 +
 clang/include/clang/Basic/Attr.td             |  33 ++
 clang/include/clang/Basic/AttrDocs.td         | 145 +++++++--
 .../clang/Basic/DiagnosticSemaKinds.td        |   4 +
 clang/include/clang/Sema/Sema.h               |  10 +
 clang/lib/AST/Type.cpp                        |  21 +-
 clang/lib/AST/TypePrinter.cpp                 |  24 ++
 clang/lib/CodeGen/CGCall.cpp                  |  15 +
 clang/lib/CodeGen/CodeGenModule.cpp           |   8 +
 clang/lib/Sema/SemaDecl.cpp                   |  10 +
 clang/lib/Sema/SemaDeclAttr.cpp               |  33 +-
 clang/lib/Sema/SemaDeclCXX.cpp                |  10 +
 clang/lib/Sema/SemaExpr.cpp                   |  38 +++
 clang/lib/Sema/SemaOverload.cpp               |  20 ++
 clang/lib/Sema/SemaType.cpp                   |  74 ++++-
 clang/test/AST/ast-dump-sme-attributes.cpp    |  66 ++++
 .../aarch64-sme-attrs.cpp                     | 303 +++++++++++++++++
 clang/test/Modules/aarch64-sme-keywords.cppm  |  65 ++++
 clang/test/Sema/aarch64-sme-func-attrs.c      | 308 ++++++++++++++++++
 20 files changed, 1210 insertions(+), 34 deletions(-)
 create mode 100644 clang/test/AST/ast-dump-sme-attributes.cpp
 create mode 100644 clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
 create mode 100644 clang/test/Modules/aarch64-sme-keywords.cppm
 create mode 100644 clang/test/Sema/aarch64-sme-func-attrs.c

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 8d20d088bb63..6f843015f85f 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -3966,6 +3966,19 @@ public:
   /// because TrailingObjects cannot handle repeated types.
   struct ExceptionType { QualType Type; };
 
+  /// The AArch64 SME ACLE (Arm C/C++ Language Extensions) defines a number
+  /// of function type attributes that can be set on function types, including
+  /// function pointers.
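+  /// For example, a function type written as 'void (void) __arm_streaming'
+  /// is represented by setting SME_PStateSMEnabledMask in these bits.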
+ enum AArch64SMETypeAttributes : unsigned { + SME_NormalFunction = 0, + SME_PStateSMEnabledMask = 1 << 0, + SME_PStateSMCompatibleMask = 1 << 1, + SME_PStateZASharedMask = 1 << 2, + SME_PStateZAPreservedMask = 1 << 3, + SME_AttributeMask = 0b111'111 // We only support maximum 6 bits because of the + // bitmask in FunctionTypeExtraBitfields. + }; + /// A simple holder for various uncommon bits which do not fit in /// FunctionTypeBitfields. Aligned to alignof(void *) to maintain the /// alignment of subsequent objects in TrailingObjects. @@ -3973,7 +3986,13 @@ public: /// The number of types in the exception specification. /// A whole unsigned is not needed here and according to /// [implimits] 8 bits would be enough here. - uint16_t NumExceptionType = 0; + unsigned NumExceptionType : 10; + + /// Any AArch64 SME ACLE type attributes that need to be propagated + /// on declarations and function pointers. + unsigned AArch64SMEAttributes : 6; + FunctionTypeExtraBitfields() + : NumExceptionType(0), AArch64SMEAttributes(SME_NormalFunction) {} }; protected: @@ -4152,18 +4171,22 @@ public: /// the various bits of extra information about a function prototype. struct ExtProtoInfo { FunctionType::ExtInfo ExtInfo; - bool Variadic : 1; - bool HasTrailingReturn : 1; + unsigned Variadic : 1; + unsigned HasTrailingReturn : 1; + unsigned AArch64SMEAttributes : 6; Qualifiers TypeQuals; RefQualifierKind RefQualifier = RQ_None; ExceptionSpecInfo ExceptionSpec; const ExtParameterInfo *ExtParameterInfos = nullptr; SourceLocation EllipsisLoc; - ExtProtoInfo() : Variadic(false), HasTrailingReturn(false) {} + ExtProtoInfo() + : Variadic(false), HasTrailingReturn(false), + AArch64SMEAttributes(SME_NormalFunction) {} ExtProtoInfo(CallingConv CC) - : ExtInfo(CC), Variadic(false), HasTrailingReturn(false) {} + : ExtInfo(CC), Variadic(false), HasTrailingReturn(false), + AArch64SMEAttributes(SME_NormalFunction) {} ExtProtoInfo withExceptionSpec(const ExceptionSpecInfo &ESI) { ExtProtoInfo Result(*this); @@ -4172,7 +4195,15 @@ public: } bool requiresFunctionProtoTypeExtraBitfields() const { - return ExceptionSpec.Type == EST_Dynamic; + return ExceptionSpec.Type == EST_Dynamic || + AArch64SMEAttributes != SME_NormalFunction; + } + + void setArmSMEAttribute(AArch64SMETypeAttributes Kind, bool Enable = true) { + if (Enable) + AArch64SMEAttributes |= Kind; + else + AArch64SMEAttributes &= ~Kind; } }; @@ -4299,6 +4330,7 @@ public: EPI.TypeQuals = getMethodQuals(); EPI.RefQualifier = getRefQualifier(); EPI.ExtParameterInfos = getExtParameterInfosOrNull(); + EPI.AArch64SMEAttributes = getAArch64SMEAttributes(); return EPI; } @@ -4480,6 +4512,15 @@ public: return getTrailingObjects(); } + /// Return a bitmask describing the SME attributes on the function type, see + /// AArch64SMETypeAttributes for their values. + unsigned getAArch64SMEAttributes() const { + if (!hasExtraBitfields()) + return SME_NormalFunction; + return getTrailingObjects() + ->AArch64SMEAttributes; + } + ExtParameterInfo getExtParameterInfo(unsigned I) const { assert(I < getNumParams() && "parameter index out of range"); if (hasExtParameterInfos()) diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 3cc826c1463a..682c869b0c58 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -323,6 +323,9 @@ let Class = FunctionProtoType in { ? 
node->getExtParameterInfos() : llvm::ArrayRef() }]; } + def : Property<"AArch64SMEAttributes", UInt32> { + let Read = [{ node->getAArch64SMEAttributes() }]; + } def : Creator<[{ auto extInfo = FunctionType::ExtInfo(noReturn, hasRegParm, regParm, @@ -338,6 +341,7 @@ let Class = FunctionProtoType in { epi.ExceptionSpec = exceptionSpecifier; epi.ExtParameterInfos = extParameterInfo.empty() ? nullptr : extParameterInfo.data(); + epi.AArch64SMEAttributes = AArch64SMEAttributes; return ctx.getFunctionType(returnType, parameters, epi); }]>; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 1cd5d8a1f552..845286606777 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2439,6 +2439,39 @@ def ArmStreaming : TypeAttr, TargetSpecificAttr { let Documentation = [ArmSmeStreamingDocs]; } +def ArmStreamingCompatible : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_streaming_compatible">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeStreamingCompatibleDocs]; +} + +def ArmSharedZA : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_shared_za">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeSharedZADocs]; +} + +def ArmPreservesZA : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_preserves_za">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmePreservesZADocs]; +} + +def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_locally_streaming">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [ArmSmeLocallyStreamingDocs]; +} + +def ArmNewZA : InheritableAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_new_za">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [ArmSmeNewZADocs]; +} +def : MutualExclusions<[ArmNewZA, ArmSharedZA]>; +def : MutualExclusions<[ArmNewZA, ArmPreservesZA]>; + + def Pure : InheritableAttr { let Spellings = [GCC<"pure">]; let Documentation = [Undocumented]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 50efb1d0ba76..708d0b0fcfcb 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6578,40 +6578,147 @@ Requirements on Development Tools - Engineering Specification Documentation }]; } -def ArmSmeStreamingDocs : Documentation { - let Category = DocCatType; +def DocCatArmSmeAttributes : DocumentationCategory<"AArch64 SME Attributes"> { let Content = [{ -.. Note:: This attribute has not been implemented yet, but once it is - implemented, it will behave as described below. +Clang supports a number of AArch64-specific attributes to manage state +added by the Scalable Matrix Extension (SME). This state includes the +runtime mode that the processor is in (e.g. non-streaming or streaming) +as well as the state of the ``ZA`` Matrix Storage. -The ``__arm_streaming`` keyword is only available on AArch64 targets. -It applies to function types and specifies that the function has a -"streaming interface". This means that: +The attributes come in the form of type- and declaration attributes: -* the function requires the Scalable Matrix Extension (SME) +* The SME declaration attributes can appear anywhere that a standard + ``[[...]]`` declaration attribute can appear. 
-* the function must be entered in streaming mode (that is, with PSTATE.SM - set to 1) +* The SME type attributes apply only to prototyped functions and can appear + anywhere that a standard ``[[...]]`` type attribute can appear. The SME + type attributes do not apply to functions having a K&R-style + unprototyped function type. -* the function must return in streaming mode - -* the function does not have a K&R-style unprototyped function type. +See `Arm C Language Extensions `_ +for more details about the features related to the SME extension. See `Procedure Call Standard for the Arm® 64-bit Architecture (AArch64) `_ for more details about -streaming-interface functions. +streaming-interface functions and shared/private-ZA interface functions. + }]; +} + +def ArmSmeStreamingDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_streaming`` keyword applies to prototyped function types and specifies +that the function has a "streaming interface". This means that: + +* the function requires that the processor implements the Scalable Matrix + Extension (SME). + +* the function must be entered in streaming mode (that is, with PSTATE.SM + set to 1) + +* the function must return in streaming mode Clang manages PSTATE.SM automatically; it is not the source code's -responsibility to do this. For example, if a normal non-streaming +responsibility to do this. For example, if a non-streaming function calls an ``__arm_streaming`` function, Clang generates code that switches into streaming mode before calling the function and switches back to non-streaming mode on return. + }]; +} + +def ArmSmeStreamingCompatibleDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_streaming_compatible`` keyword applies to prototyped function types and +specifies that the function has a “streaming compatible interface”. This +means that: -``__arm_streaming`` can appear anywhere that a standard ``[[...]]`` type -attribute can appear. +* the function may be entered in either non-streaming mode (PSTATE.SM=0) or + in streaming mode (PSTATE.SM=1). -See `Arm C Language Extensions `_ -for more details about this extension, and for other related SME features. +* the function must return in the same mode as it was entered. + +* the code executed in the function is compatible with either mode. + +Clang manages PSTATE.SM automatically; it is not the source code's +responsibility to do this. Clang will ensure that the generated code in +streaming-compatible functions is valid in either mode (PSTATE.SM=0 or +PSTATE.SM=1). For example, if an ``__arm_streaming_compatible`` function calls a +non-streaming function, Clang generates code to temporarily switch out of streaming +mode before calling the function and switch back to streaming-mode on return if +``PSTATE.SM`` is ``1`` on entry of the caller. If ``PSTATE.SM`` is ``0`` on +entry to the ``__arm_streaming_compatible`` function, the call will be executed +without changing modes. + }]; +} + +def ArmSmeSharedZADocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_shared_za`` keyword applies to prototyped function types and specifies +that the function shares SME's matrix storage (ZA) with its caller. This +means that: + +* the function requires that the processor implements the Scalable Matrix + Extension (SME). + +* the function enters with ZA in an active state. + +* the function returns with ZA in an active state. 
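+
+For example, the following declares a function that operates directly on its
+caller's ZA state (the function name is illustrative):
+
+.. code-block:: c++
+
+  void accumulate_row_to_za(const float *row) __arm_shared_za;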
+ }]; +} + +def ArmSmePreservesZADocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_preserves_za`` keyword applies to prototyped function types and +specifies that the function does not modify ZA state. + }]; +} + + +def ArmSmeLocallyStreamingDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_locally_streaming`` keyword applies to function declarations +and specifies that all the statements in the function are executed in +streaming mode. This means that: + +* the function requires that the target processor implements the Scalable Matrix + Extension (SME). + +* the program automatically puts the machine into streaming mode before + executing the statements and automatically restores the previous mode + afterwards. + +Clang manages PSTATE.SM automatically; it is not the source code's +responsibility to do this. For example, Clang will emit code to enable +streaming mode at the start of the function, and disable streaming mode +at the end of the function. + }]; +} + +def ArmSmeNewZADocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_new_za`` keyword applies to function declarations and specifies +that the function will be set up with a fresh ZA context. + +This means that: + +* the function requires that the target processor implements the Scalable Matrix + Extension (SME). + +* the function will commit any lazily saved ZA data. + +* the function will create a new ZA context and enable PSTATE.ZA. + +* the function will disable PSTATE.ZA (by setting it to 0) before returning. + +For ``__arm_new_za`` functions Clang will set up the ZA context automatically +on entry to the function, and disable it before returning. For example, if ZA is +in a dormant state Clang will generate the code to commit a lazy-save and set up +a new ZA state before executing user code. }]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 2c10ce943229..a174daabc8cd 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2035,6 +2035,10 @@ def err_different_return_type_for_overriding_virtual_function : Error< "than the function it overrides}1,2">; def note_overridden_virtual_function : Note< "overridden virtual function is here">; +def err_conflicting_overriding_attributes : Error< + "virtual function %0 has different attributes " + "%diff{($) than the function it overrides (which has $)|" + "than the function it overrides}1,2">; def err_conflicting_overriding_cc_attributes : Error< "virtual function %0 has different calling convention attributes " "%diff{($) than the function it overrides (which has calling convention $)|" diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index b2ab6d0f8445..588fdb15b2a0 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -7076,6 +7076,16 @@ public: NestedNameSpecInfo &IdInfo, bool EnteringContext); + /// The kind of conversion to check for. Either all attributes must match exactly, + /// or the converted type may add/drop '__arm_preserves_za'. + enum class AArch64SMECallConversionKind { + MatchExactly, + MayAddPreservesZA, + MayDropPreservesZA, + }; + bool IsInvalidSMECallConversion(QualType FromType, QualType ToType, + AArch64SMECallConversionKind C); + /// The parser has parsed a nested-name-specifier /// 'template[opt] template-name < template-args >::'. 
/// diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 99c859034423..f0ab7b303541 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3389,12 +3389,19 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef params, argSlot[i] = params[i]; } + // Propagate the SME ACLE attributes. + if (epi.AArch64SMEAttributes != SME_NormalFunction) { + auto &ExtraBits = *getTrailingObjects(); + assert(epi.AArch64SMEAttributes <= SME_AttributeMask && + "Not enough bits to encode SME attributes"); + ExtraBits.AArch64SMEAttributes = epi.AArch64SMEAttributes; + } + // Fill in the exception type array if present. if (getExceptionSpecType() == EST_Dynamic) { auto &ExtraBits = *getTrailingObjects(); size_t NumExceptions = epi.ExceptionSpec.Exceptions.size(); - assert(NumExceptions <= UINT16_MAX && - "Not enough bits to encode exceptions"); + assert(NumExceptions <= 1023 && "Not enough bits to encode exceptions"); ExtraBits.NumExceptionType = NumExceptions; assert(hasExtraBitfields() && "missing trailing extra bitfields!"); @@ -3551,8 +3558,11 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result, // This is followed by an optional "consumed argument" section of the // same length as the first type sequence: // bool* - // Finally, we have the ext info and trailing return type flag: - // int bool + // This is followed by the ext info: + // int + // Finally we have a trailing return type flag (bool) + // combined with AArch64 SME Attributes, to save space: + // int // // There is no ambiguity between the consumed arguments and an empty EH // spec because of the leading 'bool' which unambiguously indicates @@ -3585,8 +3595,9 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result, for (unsigned i = 0; i != NumParams; ++i) ID.AddInteger(epi.ExtParameterInfos[i].getOpaqueValue()); } + epi.ExtInfo.Profile(ID); - ID.AddBoolean(epi.HasTrailingReturn); + ID.AddInteger((epi.AArch64SMEAttributes << 1) | epi.HasTrailingReturn); } void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 1b62f6630928..eb69d0bb8755 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -938,6 +938,15 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T, FunctionType::ExtInfo Info = T->getExtInfo(); + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask)) + OS << " __arm_streaming_compatible"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask)) + OS << " __arm_streaming"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask)) + OS << " __arm_shared_za"; + if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZAPreservedMask)) + OS << " __arm_preserves_za"; + printFunctionAfter(Info, OS); if (!T->getMethodQuals().empty()) @@ -1772,6 +1781,18 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, OS << "__arm_streaming"; return; } + if (T->getAttrKind() == attr::ArmStreamingCompatible) { + OS << "__arm_streaming_compatible"; + return; + } + if (T->getAttrKind() == attr::ArmSharedZA) { + OS << "__arm_shared_za"; + return; + } + if (T->getAttrKind() == attr::ArmPreservesZA) { + OS << "__arm_preserves_za"; + return; + } OS << " __attribute__(("; switch (T->getAttrKind()) { @@ -1814,6 +1835,9 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::AnnotateType: case attr::WebAssemblyFuncref: case attr::ArmStreaming: + case 
attr::ArmStreamingCompatible: + case attr::ArmSharedZA: + case attr::ArmPreservesZA: llvm_unreachable("This attribute should have been handled already"); case attr::NSReturnsRetained: diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 6b8af9bf18c1..265490ccfa9b 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1762,6 +1762,15 @@ static void AddAttributesFromFunctionProtoType(ASTContext &Ctx, if (!isUnresolvedExceptionSpec(FPT->getExceptionSpecType()) && FPT->isNothrow()) FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); + + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + FuncAttrs.addAttribute("aarch64_pstate_sm_enabled"); + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + FuncAttrs.addAttribute("aarch64_pstate_sm_compatible"); + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) + FuncAttrs.addAttribute("aarch64_pstate_za_shared"); + if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZAPreservedMask) + FuncAttrs.addAttribute("aarch64_pstate_za_preserved"); } static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs, @@ -2402,6 +2411,12 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, llvm::toStringRef(CodeGenOpts.UniformWGSize)); } } + + if (TargetDecl->hasAttr()) + FuncAttrs.addAttribute("aarch64_pstate_sm_body"); + + if (TargetDecl->hasAttr()) + FuncAttrs.addAttribute("aarch64_pstate_za_new"); } // Attach "no-builtins" attributes to: diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index f09d1129b128..722d3dc2c63e 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2288,6 +2288,14 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, return; } + // Handle SME attributes that apply to function definitions, + // rather than to function prototypes. + if (D->hasAttr()) + B.addAttribute("aarch64_pstate_sm_body"); + + if (D->hasAttr()) + B.addAttribute("aarch64_pstate_za_new"); + // Track whether we need to add the optnone LLVM attribute, // starting with the default for this optimization level. bool ShouldAddOptNone = diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index d0b86ee97544..9a4153feb10b 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3758,6 +3758,16 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S, } } + // It is not permitted to redeclare an SME function with different SME + // attributes. + if (IsInvalidSMECallConversion(Old->getType(), New->getType(), + AArch64SMECallConversionKind::MatchExactly)) { + Diag(New->getLocation(), diag::err_sme_attr_mismatch) + << New->getType() << Old->getType(); + Diag(OldLocation, diag::note_previous_declaration); + return true; + } + // If a function is first declared with a calling convention, but is later // declared or defined without one, all following decls assume the calling // convention of the first. diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index ed69e802c95d..7f49517f0f1a 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5349,9 +5349,6 @@ bool Sema::CheckCallingConvAttr(const ParsedAttr &Attrs, CallingConv &CC, case ParsedAttr::AT_AArch64SVEPcs: CC = CC_AArch64SVEPCS; break; - case ParsedAttr::AT_ArmStreaming: - CC = CC_C; // FIXME: placeholder until real SME support is added. 
- break; case ParsedAttr::AT_AMDGPUKernelCall: CC = CC_AMDGPUKernelCall; break; @@ -8707,6 +8704,28 @@ static bool MustDelayAttributeArguments(const ParsedAttr &AL) { return false; } + +static void handleArmNewZaAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + if (auto *FPT = dyn_cast(D->getFunctionType())) { + if (FPT->getAArch64SMEAttributes() & + FunctionType::SME_PStateZASharedMask) { + S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) + << AL << "'__arm_shared_za'" << true; + AL.setInvalid(); + } + if (FPT->getAArch64SMEAttributes() & + FunctionType::SME_PStateZAPreservedMask) { + S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) + << AL << "'__arm_preserves_za'" << true; + AL.setInvalid(); + } + if (AL.isInvalid()) + return; + } + + handleSimpleAttribute(S, D, AL); +} + /// ProcessDeclAttribute - Apply the specific attribute to the specified decl if /// the attribute applies to decls. If the attribute is a type attribute, just /// silently ignore it if a GNU attribute. @@ -9462,6 +9481,14 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, handleArmBuiltinAliasAttr(S, D, AL); break; + case ParsedAttr::AT_ArmLocallyStreaming: + handleSimpleAttribute(S, D, AL); + break; + + case ParsedAttr::AT_ArmNewZA: + handleArmNewZaAttr(S, D, AL); + break; + case ParsedAttr::AT_AcquireHandle: handleAcquireHandleAttr(S, D, AL); break; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index b62f3c475c45..d42588932629 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -17985,6 +17985,16 @@ bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New, } } + // SME attributes must match when overriding a function declaration. + if (IsInvalidSMECallConversion( + Old->getType(), New->getType(), + AArch64SMECallConversionKind::MayAddPreservesZA)) { + Diag(New->getLocation(), diag::err_conflicting_overriding_attributes) + << New << New->getType() << Old->getType(); + Diag(Old->getLocation(), diag::note_overridden_virtual_function); + return true; + } + // Virtual overrides must have the same code_seg. const auto *OldCSA = Old->getAttr(); const auto *NewCSA = New->getAttr(); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 3a5e302cc03a..e82392069fa7 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9679,6 +9679,40 @@ ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc, ColonLoc, result, VK, OK); } +// Check that the SME attributes for PSTATE.ZA and PSTATE.SM are compatible. +bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType, + AArch64SMECallConversionKind C) { + unsigned FromAttributes = 0, ToAttributes = 0; + if (const auto *FromFn = + dyn_cast(Context.getCanonicalType(FromType))) + FromAttributes = + FromFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask; + if (const auto *ToFn = + dyn_cast(Context.getCanonicalType(ToType))) + ToAttributes = + ToFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask; + + if (FromAttributes == ToAttributes) + return false; + + // If the '__arm_preserves_za' is the only difference between the types, + // check whether we're allowed to add or remove it. 
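+  // (The XOR isolates exactly the attribute bits on which the two function
+  // types disagree.)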
+ if ((FromAttributes ^ ToAttributes) == + FunctionType::SME_PStateZAPreservedMask) { + switch (C) { + case AArch64SMECallConversionKind::MatchExactly: + return true; + case AArch64SMECallConversionKind::MayAddPreservesZA: + return !(ToAttributes & FunctionType::SME_PStateZAPreservedMask); + case AArch64SMECallConversionKind::MayDropPreservesZA: + return !(FromAttributes & FunctionType::SME_PStateZAPreservedMask); + } + } + + // There has been a mismatch of attributes + return true; +} + // Check if we have a conversion between incompatible cmse function pointer // types, that is, a conversion between a function pointer with the // cmse_nonsecure_call attribute and one without. @@ -9845,6 +9879,10 @@ checkPointerTypesForAssignment(Sema &S, QualType LHSType, QualType RHSType, return Sema::IncompatibleFunctionPointer; if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans)) return Sema::IncompatibleFunctionPointer; + if (S.IsInvalidSMECallConversion( + rtrans, ltrans, + Sema::AArch64SMECallConversionKind::MayDropPreservesZA)) + return Sema::IncompatibleFunctionPointer; return ConvTy; } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index aef8dc58a48d..37cec76ec116 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1682,6 +1682,26 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType, Changed = true; } + // Drop the 'arm_preserves_za' if not present in the target type (we can do + // that because it is merely a hint). + if (const auto *FromFPT = dyn_cast(FromFn)) { + FunctionProtoType::ExtProtoInfo ExtInfo = FromFPT->getExtProtoInfo(); + if (ExtInfo.AArch64SMEAttributes & + FunctionType::SME_PStateZAPreservedMask) { + unsigned ToFlags = 0; + if (const auto *ToFPT = dyn_cast(ToFn)) + ToFlags = ToFPT->getExtProtoInfo().AArch64SMEAttributes; + if (!(ToFlags & FunctionType::SME_PStateZAPreservedMask)) { + ExtInfo.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask, + false); + QualType QT = Context.getFunctionType( + FromFPT->getReturnType(), FromFPT->getParamTypes(), ExtInfo); + FromFn = QT->getAs(); + Changed = true; + } + } + } + // Drop 'noexcept' if not present in target type. 
if (const auto *FromFPT = dyn_cast(FromFn)) { const auto *ToFPT = cast(ToFn); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 0aa691d24171..2ccad19e066d 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -128,7 +128,6 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_VectorCall: \ case ParsedAttr::AT_AArch64VectorPcs: \ case ParsedAttr::AT_AArch64SVEPcs: \ - case ParsedAttr::AT_ArmStreaming: \ case ParsedAttr::AT_AMDGPUKernelCall: \ case ParsedAttr::AT_MSABI: \ case ParsedAttr::AT_SysVABI: \ @@ -143,6 +142,10 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_NoReturn: \ case ParsedAttr::AT_Regparm: \ case ParsedAttr::AT_CmseNSCall: \ + case ParsedAttr::AT_ArmStreaming: \ + case ParsedAttr::AT_ArmStreamingCompatible: \ + case ParsedAttr::AT_ArmSharedZA: \ + case ParsedAttr::AT_ArmPreservesZA: \ case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: \ case ParsedAttr::AT_AnyX86NoCfCheck: \ CALLING_CONV_ATTRS_CASELIST @@ -7772,6 +7775,26 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) { llvm_unreachable("unexpected attribute kind!"); } +static bool checkMutualExclusion(TypeProcessingState &state, + const FunctionProtoType::ExtProtoInfo &EPI, + ParsedAttr &Attr, + AttributeCommonInfo::Kind OtherKind) { + auto OtherAttr = std::find_if( + state.getCurrentAttributes().begin(), state.getCurrentAttributes().end(), + [OtherKind](const ParsedAttr &A) { return A.getKind() == OtherKind; }); + if (OtherAttr == state.getCurrentAttributes().end() || OtherAttr->isInvalid()) + return false; + + Sema &S = state.getSema(); + S.Diag(Attr.getLoc(), diag::err_attributes_are_not_compatible) + << *OtherAttr << Attr + << (OtherAttr->isRegularKeywordAttribute() || + Attr.isRegularKeywordAttribute()); + S.Diag(OtherAttr->getLoc(), diag::note_conflicting_attribute); + Attr.setInvalid(); + return true; +} + /// Process an individual function attribute. Returns true to /// indicate that the attribute was handled, false if it wasn't. static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, @@ -7901,6 +7924,55 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, return true; } + if (attr.getKind() == ParsedAttr::AT_ArmStreaming || + attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible || + attr.getKind() == ParsedAttr::AT_ArmSharedZA || + attr.getKind() == ParsedAttr::AT_ArmPreservesZA){ + if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr)) + return true; + + if (!unwrapped.isFunctionType()) + return false; + + const auto *FnTy = unwrapped.get()->getAs(); + if (!FnTy) { + // SME ACLE attributes are not supported on K&R-style unprototyped C + // functions. 
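+    // (In C++ every function type is prototyped, so this is only reachable
+    // for C code.)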
+ S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type) << + attr << attr.isRegularKeywordAttribute() << ExpectedFunctionWithProtoType; + attr.setInvalid(); + return false; + } + + FunctionProtoType::ExtProtoInfo EPI = FnTy->getExtProtoInfo(); + switch (attr.getKind()) { + case ParsedAttr::AT_ArmStreaming: + if (checkMutualExclusion(state, EPI, attr, + ParsedAttr::AT_ArmStreamingCompatible)) + return true; + EPI.setArmSMEAttribute(FunctionType::SME_PStateSMEnabledMask); + break; + case ParsedAttr::AT_ArmStreamingCompatible: + if (checkMutualExclusion(state, EPI, attr, ParsedAttr::AT_ArmStreaming)) + return true; + EPI.setArmSMEAttribute(FunctionType::SME_PStateSMCompatibleMask); + break; + case ParsedAttr::AT_ArmSharedZA: + EPI.setArmSMEAttribute(FunctionType::SME_PStateZASharedMask); + break; + case ParsedAttr::AT_ArmPreservesZA: + EPI.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask); + break; + default: + llvm_unreachable("Unsupported attribute"); + } + + QualType newtype = S.Context.getFunctionType(FnTy->getReturnType(), + FnTy->getParamTypes(), EPI); + type = unwrapped.wrap(S, newtype->getAs()); + return true; + } + if (attr.getKind() == ParsedAttr::AT_NoThrow) { // Delay if this is not a function type. if (!unwrapped.isFunctionType()) diff --git a/clang/test/AST/ast-dump-sme-attributes.cpp b/clang/test/AST/ast-dump-sme-attributes.cpp new file mode 100644 index 000000000000..6581fd4baba9 --- /dev/null +++ b/clang/test/AST/ast-dump-sme-attributes.cpp @@ -0,0 +1,66 @@ +// Test without serialization: +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -std=c++2a -ast-dump -ast-dump-filter Foo %s | FileCheck -strict-whitespace %s + +// Test with serialization: +// RUN: %clang_cc1 -std=c++20 -triple aarch64 -target-feature +sme -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++20 -triple aarch64 -target-feature +sme -include-pch %t -ast-dump-all -ast-dump-filter Foo /dev/null \ +// RUN: | sed -e "s/ //" -e "s/ imported//" \ +// RUN: | FileCheck --strict-whitespace %s + +struct Foo { +// CHECK: |-CXXRecordDecl {{.*}} implicit struct Foo +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming 'void () __arm_streaming' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming_compatible 'void () __arm_streaming_compatible' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_locally_streaming 'void ()' +// CHECK-NEXT: | `-ArmLocallyStreamingAttr +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_shared_za 'void () __arm_shared_za' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_new_za 'void ()' +// CHECK-NEXT: | `-ArmNewZAAttr +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_preserves_za 'void () __arm_preserves_za' + void f_streaming() __arm_streaming; + void f_streaming_compatible() __arm_streaming_compatible; + __arm_locally_streaming void f_locally_streaming(); + void f_shared_za() __arm_shared_za; + __arm_new_za void f_new_za(); + void f_preserves_za() __arm_preserves_za; + + +// CHECK: |-CXXMethodDecl {{.*}} test_lambda 'int (int)' implicit-inline +// CHECK: `-CompoundStmt +// CHECK-NEXT: |-DeclStmt +// CHECK-NEXT: | `-VarDecl +// CHECK-NEXT: | `-LambdaExpr +// CHECK-NEXT: | |-CXXRecordDecl +// CHECK: | | |-CXXMethodDecl {{.*}} used constexpr operator() 'int (int) __arm_streaming const' inline +// CHECK: | | |-CXXConversionDecl {{.*}} implicit constexpr operator int (*)(int) __arm_streaming 'int (*() const noexcept)(int) __arm_streaming' inline +// CHECK: | | |-CXXMethodDecl {{.*}} implicit __invoke 'int (int) __arm_streaming' static inline +// CHECK: `-ReturnStmt +// CHECK: `-CXXOperatorCallExpr +// 
CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int) __arm_streaming const' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int) __arm_streaming const' lvalue CXXMethod {{.*}} 'operator()' 'int (int) __arm_streaming const' + int test_lambda(int x) { + auto F = [](int x) __arm_streaming { return x; }; + return F(x); + } + +// CHECK: |-TypedefDecl {{.*}} referenced s_ptrty 'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: | `-PointerType {{.*}} 'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: | `-ParenType {{.*}} 'void (int, int) __arm_streaming' sugar +// CHECK-NEXT: | `-FunctionProtoType {{.*}} 'void (int, int) __arm_streaming' cdecl + typedef void (*s_ptrty) (int, int) __arm_streaming; + +// CHECK: `-CXXMethodDecl {{.*}} test_streaming_ptrty 'void (s_ptrty, int, int)' implicit-inline +// CHECK-NEXT: |-ParmVarDecl {{.*}} used f 's_ptrty':'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: |-ParmVarDecl {{.*}} used x 'int' +// CHECK-NEXT: |-ParmVarDecl {{.*}} used y 'int' +// CHECK: `-CompoundStmt +// CHECK-NEXT: `-ReturnStmt +// CHECK-NEXT: `-CallExpr +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 's_ptrty':'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 's_ptrty':'void (*)(int, int) __arm_streaming' lvalue ParmVar {{.*}} 'f' 's_ptrty':'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'x' 'int' +// CHECK-NEXT: `-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'y' 'int' + void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); }; +}; diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp new file mode 100644 index 000000000000..52937495484d --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -0,0 +1,303 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \ +// RUN: -S -disable-O0-optnone -Werror -emit-llvm -o - %s \ +// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=inline \ +// RUN: | FileCheck %s + +extern "C" { + +extern int normal_callee(); + +// == FUNCTION DECLARATIONS == + +int streaming_decl(void) __arm_streaming; +int streaming_compatible_decl(void) __arm_streaming_compatible; +int shared_za_decl(void) __arm_shared_za; +int preserves_za_decl(void) __arm_preserves_za; +int private_za_decl(void); + +// == FUNCTION DEFINITIONS == + +// CHECK-LABEL: @streaming_caller() +// CHECK-SAME: #[[SM_ENABLED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int streaming_caller() __arm_streaming { + return normal_callee(); +} + +// CHECK: declare i32 @normal_callee() #[[NORMAL_DECL:[0-9]+]] + + +// CHECK-LABEL: @streaming_callee() +// CHECK-SAME: #[[SM_ENABLED]] +// CHECK: call i32 @streaming_decl() #[[SM_ENABLED_CALL:[0-9]+]] +// + int streaming_callee() __arm_streaming { + return streaming_decl(); +} + +// CHECK: declare i32 @streaming_decl() #[[SM_ENABLED_DECL:[0-9]+]] + +// CHECK-LABEL: @streaming_compatible_caller() +// CHECK-SAME: #[[SM_COMPATIBLE:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int streaming_compatible_caller() __arm_streaming_compatible { + return normal_callee(); +} + +// CHECK-LABEL: @streaming_compatible_callee() +// CHECK-SAME: #[[SM_COMPATIBLE]] +// CHECK: call i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_CALL:[0-9]+]] +// + int streaming_compatible_callee() __arm_streaming_compatible { + return 
streaming_compatible_decl(); +} + +// CHECK: declare i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_DECL:[0-9]+]] + +// CHECK-LABEL: @locally_streaming_caller() +// CHECK-SAME: #[[SM_BODY:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__arm_locally_streaming int locally_streaming_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @locally_streaming_callee() +// CHECK-SAME: #[[SM_BODY]] +// CHECK: call i32 @locally_streaming_caller() #[[SM_BODY_CALL:[0-9]+]] +// +__arm_locally_streaming int locally_streaming_callee() { + return locally_streaming_caller(); +} + + +// CHECK-LABEL: @shared_za_caller() +// CHECK-SAME: #[[ZA_SHARED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int shared_za_caller() __arm_shared_za { + return normal_callee(); +} + +// CHECK-LABEL: @shared_za_callee() +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call i32 @shared_za_decl() #[[ZA_SHARED_CALL:[0-9]+]] +// + int shared_za_callee() __arm_shared_za { + return shared_za_decl(); +} + +// CHECK: declare i32 @shared_za_decl() #[[ZA_SHARED_DECL:[0-9]+]] + + +// CHECK-LABEL: @preserves_za_caller() +// CHECK-SAME: #[[ZA_PRESERVED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int preserves_za_caller() __arm_preserves_za { + return normal_callee(); +} + +// CHECK-LABEL: @preserves_za_callee() +// CHECK-SAME: #[[ZA_PRESERVED]] +// CHECK: call i32 @preserves_za_decl() #[[ZA_PRESERVED_CALL:[0-9]+]] +// + int preserves_za_callee() __arm_preserves_za { + return preserves_za_decl(); +} + +// CHECK: declare i32 @preserves_za_decl() #[[ZA_PRESERVED_DECL:[0-9]+]] + + +// CHECK-LABEL: @new_za_caller() +// CHECK-SAME: #[[ZA_NEW:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__arm_new_za int new_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @new_za_callee() +// CHECK-SAME: #[[ZA_NEW]] +// CHECK: call i32 @private_za_decl() +// +__arm_new_za int new_za_callee() { + return private_za_decl(); +} + +// CHECK: declare i32 @private_za_decl() + + +// Ensure that the attributes are correctly propagated to function types +// and also to callsites. 
+typedef void (*s_ptrty) (int, int) __arm_streaming; +typedef void (*sc_ptrty) (int, int) __arm_streaming_compatible; +typedef void (*sz_ptrty) (int, int) __arm_shared_za; +typedef void (*pz_ptrty) (int, int) __arm_preserves_za; + +// CHECK-LABEL: @test_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_streaming_compatible_ptrty( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_COMPATIBLE_CALL]] +// +void test_streaming_compatible_ptrty(sc_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_shared_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_SHARED_CALL]] +// +void test_shared_za(sz_ptrty f, int x, int y) __arm_shared_za { return f(x, y); } +// CHECK-LABEL: @test_preserved_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_PRESERVED_CALL]] +// +void test_preserved_za(pz_ptrty f, int x, int y) __arm_shared_za { return f(x, y); } + +// CHECK-LABEL: @test_indirect_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +typedef s_ptrty **indirect_s_ptrty; +void test_indirect_streaming_ptrty(indirect_s_ptrty fptr, int x, int y) { return (**fptr)(x, y); } +} // extern "C" + +// +// Test that having the attribute in different places (on declaration and on type) +// both results in the attribute being applied to the type. +// + +// CHECK-LABEL: @_Z24test_same_type_streamingv( +// CHECK: call void @_Z10streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z10streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: ret void +// CHECK: } +// CHECK: declare void @_Z10streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z10streaming2v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming2v() #[[SM_ENABLED_DECL]] +void streaming1(void) __arm_streaming; +void streaming2() __arm_streaming; +decltype(streaming1) same_type_streaming1; +decltype(streaming2) same_type_streaming2; +void test_same_type_streaming() { + streaming1(); + streaming2(); + same_type_streaming1(); + same_type_streaming2(); +} + +// +// Test overloading; the attribute is not required for overloaded types and +// does not apply if not specified. 
+//
+
+// CHECK-LABEL: @_Z12overloadedfni(
+// CHECK-SAME: #[[SM_ENABLED]]
+int overloadedfn(int x) __arm_streaming { return x; }
+// CHECK-LABEL: @_Z12overloadedfnf(
+// CHECK-SAME: #[[NORMAL_DEF]]
+//
+float overloadedfn(float x) { return x; }
+// CHECK-LABEL: @_Z13test_overloadi(
+// CHECK-SAME: #[[NORMAL_DEF]]
+//
+int test_overload(int x) { return overloadedfn(x); }
+// CHECK-LABEL: @_Z13test_overloadf(
+// CHECK-SAME: #[[NORMAL_DEF]]
+//
+float test_overload(float x) { return overloadedfn(x); }
+
+// CHECK-LABEL: @_Z11test_lambdai(
+// CHECK-SAME: #[[NORMAL_DEF]]
+// CHECK: call noundef i32 @"_ZZ11test_lambdaiENK3$_0clEi"({{.*}}) #[[SM_ENABLED_CALL]]
+//
+// CHECK: @"_ZZ11test_lambdaiENK3$_0clEi"(
+// CHECK-SAME: #[[SM_ENABLED]]
+int test_lambda(int x) {
+  auto F = [](int x) __arm_streaming { return x; };
+  return F(x);
+}
+
+// CHECK-LABEL: @_Z27test_template_instantiationv(
+// CHECK-SAME: #[[NORMAL_DEF]]
+// CHECK: call noundef i32 @_Z15template_functyIiET_S0_(i32 noundef 12) #[[SM_ENABLED_CALL]]
+//
+// CHECK: @_Z15template_functyIiET_S0_(
+// CHECK-SAME: #[[SM_ENABLED]]
+template <typename Ty>
+Ty template_functy(Ty x) __arm_streaming { return x; }
+int test_template_instantiation() { return template_functy(12); }
+
+//
+// Test that arm_locally_streaming is inherited by future redeclarations,
+// even when they don't specify the attribute.
+//
+
+// CHECK: define {{.*}} @_Z25locally_streaming_inheritv(
+// CHECK-SAME: #[[SM_BODY]]
+__arm_locally_streaming void locally_streaming_inherit();
+void locally_streaming_inherit() {
+  streaming_decl();
+}
+
+// Test that the attributes are propagated properly to calls
+// when using a variadic template as indirection.
+__attribute__((always_inline))
+int call() { return 0; }
+
+template <typename T, typename... Other>
+__attribute__((always_inline))
+int call(T f, Other...
other) { + return f() + call(other...); +} + +// CHECK: {{.*}} @_Z22test_variadic_templatev( +// CHECK: call {{.*}} i32 @normal_callee() #[[NOUNWIND_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @streaming_decl() #[[NOUNWIND_SM_ENABLED_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @streaming_compatible_decl() #[[NOUNWIND_SM_COMPATIBLE_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @shared_za_decl() #[[NOUNWIND_ZA_SHARED_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @preserves_za_decl() #[[NOUNWIND_ZA_PRESERVED_CALL:[0-9]+]] +// CHECK-NEXT: add nsw +// CHECK-NEXT: add nsw +// CHECK-NEXT: add nsw +// CHECK-NEXT: add nsw +// CHECK-NEXT: ret +int test_variadic_template() { + return call(normal_callee, + streaming_decl, + streaming_compatible_decl, + shared_za_decl, + preserves_za_decl); +} + +// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } +// CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } +// CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } +// CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_pstate_za_shared" } +// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" } +// CHECK: attributes #[[NOUNWIND_CALL]] = { nounwind } +// CHECK: attributes #[[NOUNWIND_SM_ENABLED_CALL]] = { nounwind "aarch64_pstate_sm_enabled" } +// CHECK: attributes #[[NOUNWIND_SM_COMPATIBLE_CALL]] = { nounwind 
"aarch64_pstate_sm_compatible" } +// CHECK: attributes #[[NOUNWIND_ZA_SHARED_CALL]] = { nounwind "aarch64_pstate_za_shared" } +// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_pstate_za_preserved" } + diff --git a/clang/test/Modules/aarch64-sme-keywords.cppm b/clang/test/Modules/aarch64-sme-keywords.cppm new file mode 100644 index 000000000000..6784aaa01d21 --- /dev/null +++ b/clang/test/Modules/aarch64-sme-keywords.cppm @@ -0,0 +1,65 @@ +// REQUIRES: aarch64-registered-target +// +// RUN: rm -rf %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -triple aarch64 -target-feature +sme %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -triple aarch64 -target-feature +sme -fprebuilt-module-path=%t -I%t %t/Use.cpp -emit-llvm +// RUN: cat %t/Use.ll | FileCheck %s + +//--- A.cppm +module; +export module A; + +export void f_streaming(void) __arm_streaming { } +export void f_streaming_compatible(void) __arm_streaming_compatible { } +export void f_shared_za(void) __arm_shared_za { } +export void f_preserves_za(void) __arm_preserves_za { } + +//--- Use.cpp +// expected-no-diagnostics +import A; + +// CHECK: define dso_local void @_Z18f_shared_za_callerv() #[[SHARED_ZA_DEF:[0-9]+]] { +// CHECK: entry: +// CHECK: call void @_ZW1A11f_shared_zav() #[[SHARED_ZA_USE:[0-9]+]] +// CHECK: call void @_ZW1A14f_preserves_zav() #[[PRESERVES_ZA_USE:[0-9]+]] +// CHECK: ret void +// CHECK: } +// +// CHECK:declare void @_ZW1A11f_shared_zav() #[[SHARED_ZA_DECL:[0-9]+]] +// +// CHECK:declare void @_ZW1A14f_preserves_zav() #[[PRESERVES_ZA_DECL:[0-9]+]] +// +// CHECK:; Function Attrs: mustprogress noinline nounwind optnone +// CHECK:define dso_local void @_Z21f_nonstreaming_callerv() #[[NORMAL_DEF:[0-9]+]] { +// CHECK:entry: +// CHECK: call void @_ZW1A11f_streamingv() #[[STREAMING_USE:[0-9]+]] +// CHECK: call void @_ZW1A22f_streaming_compatiblev() #[[STREAMING_COMPATIBLE_USE:[0-9]+]] +// CHECK: ret void +// CHECK:} +// +// CHECK:declare void @_ZW1A11f_streamingv() #[[STREAMING_DECL:[0-9]+]] +// +// CHECK:declare void @_ZW1A22f_streaming_compatiblev() #[[STREAMING_COMPATIBLE_DECL:[0-9]+]] +// +// CHECK-DAG: attributes #[[SHARED_ZA_DEF]] = {{{.*}} "aarch64_pstate_za_shared" {{.*}}} +// CHECK-DAG: attributes #[[SHARED_ZA_DECL]] = {{{.*}} "aarch64_pstate_za_shared" {{.*}}} +// CHECK-DAG: attributes #[[PRESERVES_ZA_DECL]] = {{{.*}} "aarch64_pstate_za_preserved" {{.*}}} +// CHECK-DAG: attributes #[[NORMAL_DEF]] = {{{.*}}} +// CHECK-DAG: attributes #[[STREAMING_DECL]] = {{{.*}} "aarch64_pstate_sm_enabled" {{.*}}} +// CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_DECL]] = {{{.*}} "aarch64_pstate_sm_compatible" {{.*}}} +// CHECK-DAG: attributes #[[SHARED_ZA_USE]] = { "aarch64_pstate_za_shared" } +// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_pstate_za_preserved" } +// CHECK-DAG: attributes #[[STREAMING_USE]] = { "aarch64_pstate_sm_enabled" } +// CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_USE]] = { "aarch64_pstate_sm_compatible" } + +void f_shared_za_caller(void) __arm_shared_za { + f_shared_za(); + f_preserves_za(); +} + +void f_nonstreaming_caller(void) { + f_streaming(); + f_streaming_compatible(); +} diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c new file mode 100644 index 000000000000..71140a9ebc61 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -0,0 +1,308 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s +// RUN: %clang_cc1 
-triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify=expected-cpp -x c++ %s + +// Valid attributes + +void sme_arm_streaming(void) __arm_streaming; +void sme_arm_streaming_compatible(void) __arm_streaming_compatible; + +__arm_new_za void sme_arm_new_za(void) {} +void sme_arm_shared_za(void) __arm_shared_za; +void sme_arm_preserves_za(void) __arm_preserves_za; + +__arm_new_za void sme_arm_streaming_new_za(void) __arm_streaming {} +void sme_arm_streaming_shared_za(void) __arm_streaming __arm_shared_za; +void sme_arm_streaming_preserves_za(void) __arm_streaming __arm_preserves_za; + +__arm_new_za void sme_arm_sc_new_za(void) __arm_streaming_compatible {} +void sme_arm_sc_shared_za(void) __arm_streaming_compatible __arm_shared_za; +void sme_arm_sc_preserves_za(void) __arm_streaming_compatible __arm_preserves_za; + +void sme_arm_shared_preserves_za(void) __arm_shared_za __arm_preserves_za; + +__arm_locally_streaming void sme_arm_locally_streaming(void) { } +__arm_locally_streaming void sme_arm_streaming_and_locally_streaming(void) __arm_streaming { } +__arm_locally_streaming void sme_arm_streaming_and_streaming_compatible(void) __arm_streaming_compatible { } + +__arm_locally_streaming __arm_new_za void sme_arm_ls_new_za(void) { } +__arm_locally_streaming void sme_arm_ls_shared_za(void) __arm_shared_za { } +__arm_locally_streaming void sme_arm_ls_preserves_za(void) __arm_preserves_za { } + +// Valid attributes on function pointers + +void streaming_ptr(void) __arm_streaming; +typedef void (*fptrty1) (void) __arm_streaming; +fptrty1 call_streaming_func() { return streaming_ptr; } + +void streaming_compatible_ptr(void) __arm_streaming_compatible; +typedef void (*fptrty2) (void) __arm_streaming_compatible; +fptrty2 call_sc_func() { return streaming_compatible_ptr; } + +void shared_za_ptr(void) __arm_shared_za; +typedef void (*fptrty3) (void) __arm_shared_za; +fptrty3 call_shared_za_func() { return shared_za_ptr; } + +void preserves_za_ptr(void) __arm_preserves_za; +typedef void (*fptrty4) (void) __arm_preserves_za; +fptrty4 call_preserve_za_func() { return preserves_za_ptr; } + +void shared_preserves_za_ptr(void) __arm_shared_za __arm_preserves_za; +typedef void (*fptrty5) (void) __arm_shared_za __arm_preserves_za; +fptrty5 call_shared_preserve_za_func() { return shared_preserves_za_ptr; } + +typedef void (*fptrty6) (void); +fptrty6 cast_nza_func_to_normal() { return sme_arm_new_za; } +fptrty6 cast_ls_func_to_normal() { return sme_arm_locally_streaming; } + +// Invalid attributes + +// expected-cpp-error@+4 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +void streaming_mode(void) __arm_streaming __arm_streaming_compatible; + +// expected-cpp-error@+4 {{'__arm_streaming' and '__arm_streaming_compatible' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming' and '__arm_streaming_compatible' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +void streaming_compatible(void) __arm_streaming_compatible __arm_streaming; + +// expected-cpp-error@+2 {{'__arm_new_za' and '__arm_shared_za' are not compatible}} +// expected-error@+1 {{'__arm_new_za' and '__arm_shared_za' are not compatible}} +__arm_new_za void new_shared_za(void) __arm_shared_za {} + +// 
expected-cpp-error@+2 {{'__arm_new_za' and '__arm_preserves_za' are not compatible}} +// expected-error@+1 {{'__arm_new_za' and '__arm_preserves_za' are not compatible}} +__arm_new_za void new_preserves_za(void) __arm_preserves_za {} + +// Invalid attributes on function pointers + +// expected-cpp-error@+4 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +void streaming_ptr_invalid(void) __arm_streaming __arm_streaming_compatible; +// expected-cpp-error@+4 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +typedef void (*fptrty7) (void) __arm_streaming __arm_streaming_compatible; +fptrty7 invalid_streaming_func() { return streaming_ptr_invalid; } + +// expected-warning@+2 {{'__arm_streaming' only applies to non-K&R-style functions}} +// expected-error@+1 {{'__arm_streaming' only applies to function types; type here is 'void ()'}} +void function_no_prototype() __arm_streaming; + +// +// Check for incorrect conversions of function pointers with the attributes +// + +typedef void (*n_ptrty) (void); +typedef void (*s_ptrty) (void) __arm_streaming; +s_ptrty return_valid_streaming_fptr(s_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 's_ptrty' (aka 'void (*)() __arm_streaming') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 's_ptrty' (aka 'void (*)(void) __arm_streaming')}} +s_ptrty return_invalid_fptr_streaming_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 's_ptrty' (aka 'void (*)() __arm_streaming')}} +// expected-error@+1 {{incompatible function pointer types returning 's_ptrty' (aka 'void (*)(void) __arm_streaming') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_streaming(s_ptrty f) { return f; } + +// Test an instance where the result type is not a prototyped function, such that we still get a diagnostic. 
+typedef void (*nonproto_n_ptrty) (); +// expected-cpp-error@+2 {{cannot initialize return object of type 'nonproto_n_ptrty' (aka 'void (*)()') with an lvalue of type 's_ptrty' (aka 'void (*)() __arm_streaming')}} +// expected-error@+1 {{incompatible function pointer types returning 's_ptrty' (aka 'void (*)(void) __arm_streaming') from a function with result type 'nonproto_n_ptrty' (aka 'void (*)()')}} +nonproto_n_ptrty return_invalid_fptr_streaming_nonprotonormal(s_ptrty f) { return f; } + +typedef void (*sc_ptrty) (void) __arm_streaming_compatible; +sc_ptrty return_valid_streaming_compatible_fptr(sc_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 'sc_ptrty' (aka 'void (*)() __arm_streaming_compatible') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sc_ptrty' (aka 'void (*)(void) __arm_streaming_compatible')}} +sc_ptrty return_invalid_fptr_streaming_compatible_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sc_ptrty' (aka 'void (*)() __arm_streaming_compatible')}} +// expected-error@+1 {{incompatible function pointer types returning 'sc_ptrty' (aka 'void (*)(void) __arm_streaming_compatible') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_streaming_compatible(sc_ptrty f) { return f; } + +typedef void (*sz_ptrty) (void) __arm_shared_za; +sz_ptrty return_valid_shared_za_fptr(sz_ptrty f) { return f; } + + +// expected-cpp-error@+2 {{cannot initialize return object of type 'sz_ptrty' (aka 'void (*)() __arm_shared_za') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sz_ptrty' (aka 'void (*)(void) __arm_shared_za')}} +sz_ptrty return_invalid_fptr_shared_za_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sz_ptrty' (aka 'void (*)() __arm_shared_za')}} +// expected-error@+1 {{incompatible function pointer types returning 'sz_ptrty' (aka 'void (*)(void) __arm_shared_za') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_shared_za(sz_ptrty f) { return f; } + +typedef void (*pz_ptrty) (void) __arm_preserves_za; +pz_ptrty return_valid_preserves_za_fptr(pz_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 'pz_ptrty' (aka 'void (*)() __arm_preserves_za') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'pz_ptrty' (aka 'void (*)(void) __arm_preserves_za')}} +pz_ptrty return_invalid_fptr_preserves_za_normal(n_ptrty f) { return f; } +// No diagnostics, the preserves_za hint should be dropped silently. 
+n_ptrty return_invalid_fptr_normal_preserves_za(pz_ptrty f) { return f; }
+
+// Test template instantiations
+#ifdef __cplusplus
+template <typename T> T templated(T x) __arm_streaming { return x; }
+template <> int templated(int x) __arm_streaming { return x + 1; }
+template <> float templated(float x) __arm_streaming { return x + 2; }
+// expected-cpp-error@+2 {{explicit instantiation of 'templated' does not refer to a function template, variable template, member function, member class, or static data member}}
+// expected-cpp-note@-4 {{candidate template ignored: could not match 'short (short) __arm_streaming' against 'short (short)'}}
+template short templated(short);
+#endif
+
+// Conflicting attributes on redeclarations
+
+// expected-error@+5 {{function declared 'void (void) __arm_streaming_compatible' was previously declared 'void (void) __arm_streaming', which has different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared 'void () __arm_streaming_compatible' was previously declared 'void () __arm_streaming', which has different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+void redecl(void) __arm_streaming;
+void redecl(void) __arm_streaming_compatible { }
+
+// expected-error@+5 {{function declared 'void (void) __arm_shared_za' was previously declared 'void (void) __arm_shared_za __arm_preserves_za', which has different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared 'void () __arm_shared_za' was previously declared 'void () __arm_shared_za __arm_preserves_za', which has different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+void redecl_preserve_za(void) __arm_shared_za __arm_preserves_za;
+void redecl_preserve_za(void) __arm_shared_za {}
+
+// expected-error@+5 {{function declared 'void (void) __arm_shared_za __arm_preserves_za' was previously declared 'void (void) __arm_shared_za', which has different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared 'void () __arm_shared_za __arm_preserves_za' was previously declared 'void () __arm_shared_za', which has different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+void redecl_nopreserve_za(void) __arm_shared_za;
+void redecl_nopreserve_za(void) __arm_shared_za __arm_preserves_za {}
+
+#ifdef __cplusplus
+struct S {
+  virtual void shared_za_memberfn(void) __arm_shared_za;
+};
+
+struct S2 : public S {
+// expected-cpp-error@+2 {{virtual function 'shared_za_memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_shared_za')}}
+// expected-cpp-note@-5 {{overridden virtual function is here}}
+  __arm_new_za void shared_za_memberfn(void) override {}
+};
+
+// The '__arm_preserves_za' property cannot be dropped when overriding a virtual
+// function. It is however fine for the overriding function to be '__arm_preserves_za'
+// even though the function that it overrides is not.
+
+struct S_PreservesZA {
+  virtual void memberfn(void) __arm_preserves_za;
+};
+
+struct S_Drop_PreservesZA : S_PreservesZA {
+// expected-cpp-error@+2 {{virtual function 'memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_preserves_za')}}
+// expected-cpp-note@-5 {{overridden virtual function is here}}
+  void memberfn(void) override {}
+};
+
+struct S_NoPreservesZA {
+  virtual void memberfn(void);
+};
+struct S_AddPreservesZA : S_NoPreservesZA {
+// This is fine, the overridden function just adds more guarantees.
+  void memberfn(void) __arm_preserves_za override {}
+};
+
+
+// Check that the attribute propagates through template instantiations.
+template <typename T>
+struct S3 {
+  static constexpr int value = 0;
+};
+
+template <>
+struct S3<void (*)(void)> {
+  static constexpr int value = 1;
+};
+
+template <>
+struct S3<void (*)(void) __arm_streaming> {
+  static constexpr int value = 2;
+};
+
+template <>
+struct S3<void (*)(void) __arm_streaming_compatible> {
+  static constexpr int value = 4;
+};
+
+template <>
+struct S3<void (*)(void) __arm_shared_za> {
+  static constexpr int value = 8;
+};
+
+template <>
+struct S3<void (*)(void) __arm_preserves_za> {
+  static constexpr int value = 16;
+};
+
+void normal_func(void) {}
+void streaming_func(void) __arm_streaming {}
+void streaming_compatible_func(void) __arm_streaming_compatible {}
+void shared_za_func(void) __arm_shared_za {}
+void preserves_za_func(void) __arm_preserves_za {}
+
+static_assert(S3<decltype(&normal_func)>::value == 1, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&streaming_func)>::value == 2, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&streaming_compatible_func)>::value == 4, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&shared_za_func)>::value == 8, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&preserves_za_func)>::value == 16, "why are we picking the wrong specialization?");
+
+// Also test the attribute is propagated with variadic templates
+constexpr int eval_variadic_template() { return 0; }
+template <typename T, typename... Other>
+constexpr int eval_variadic_template(T f, Other... other) {
+  return S3<T>::value + eval_variadic_template(other...);
+}
+static_assert(eval_variadic_template(normal_func, streaming_func,
+                                     streaming_compatible_func,
+                                     shared_za_func, preserves_za_func) == 31,
+              "attributes not propagated properly in variadic template");
+
+// Test that the attribute is propagated with template specialization.
+template <typename T> int test_templated_f(T);
+template<> constexpr int test_templated_f(void(*)(void)) { return 1; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_streaming) { return 2; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_streaming_compatible) { return 4; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_shared_za) { return 8; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_preserves_za) { return 16; }
+
+static_assert(test_templated_f(&normal_func) == 1, "Instantiated to wrong function");
+static_assert(test_templated_f(&streaming_func) == 2, "Instantiated to wrong function");
+static_assert(test_templated_f(&streaming_compatible_func) == 4, "Instantiated to wrong function");
+static_assert(test_templated_f(&shared_za_func) == 8, "Instantiated to wrong function");
+static_assert(test_templated_f(&preserves_za_func) == 16, "Instantiated to wrong function");
+
+// expected-cpp-error@+2 {{'__arm_streaming' only applies to function types; type here is 'int'}}
+// expected-error@+1 {{'__arm_streaming' only applies to function types; type here is 'int'}}
+int invalid_type_for_attribute __arm_streaming;
+
+// Test overloads
+constexpr int overload(void f(void)) { return 1; }
+constexpr int overload(void f(void) __arm_streaming) { return 2; }
+constexpr int overload(void f(void) __arm_streaming_compatible) { return 4; }
+constexpr int overload(void f(void) __arm_shared_za) { return 8; }
+constexpr int overload(void f(void) __arm_preserves_za) { return 16; }
+static_assert(overload(&normal_func) == 1, "Overloaded to wrong function");
+static_assert(overload(&streaming_func) == 2, "Overloaded to wrong function");
+static_assert(overload(&streaming_compatible_func) == 4, "Overloaded to wrong function");
+static_assert(overload(&shared_za_func) == 8, "Overloaded to wrong function");
+static_assert(overload(&preserves_za_func) == 16, "Overloaded to wrong function");
+
+// Test implicit instantiation
+template <typename T> struct X {
+  static void foo(T) __arm_streaming { }
+};
+constexpr int overload_int(void f(int)) { return 1; }
+constexpr int overload_int(void f(int) __arm_streaming) { return 2; }
+constexpr X<int> *ptr = 0;
+static_assert(overload_int(ptr->foo) == 2, "Overloaded to the wrong function after implicit instantiation");
+
+#endif // ifdef __cplusplus
-- 
Gitee

From a0fe7e1f4710bd749b20edbc08897e0a174c8403 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Thu, 31 Aug 2023 13:28:48 +0000
Subject: [PATCH 07/77] [AArch64][ISel] NFC: Change streaming mode only
 through 'changeStreamingMode'

This simplifies the code and unifies code-paths to use a single function
for emitting streaming-mode changes.

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../lib/Target/AArch64/AArch64ISelLowering.cpp | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0cc5e7fc5cc3..5109822101ad 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6579,12 +6579,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   // make sure it is Glued to the last CopyFromReg value.
if (IsLocallyStreaming) { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); - Chain = DAG.getNode( - AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), - {DAG.getRoot(), - DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64), - DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue}); + Chain = + changeStreamingMode(DAG, DL, /*Enable*/ true, DAG.getRoot(), Glue, + DAG.getConstant(0, DL, MVT::i64), /*Entry*/ true); + // Ensure that the SMSTART happens after the CopyWithChain such that its // chain result is used. for (unsigned I=0; IgetSMStartStopCallPreservedMask())); + Chain = changeStreamingMode( + DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), + DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true); Glue = Chain.getValue(1); } -- Gitee From 87f115176d2c9be15d98bd7a5ff208b8ce84f569 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 1 Sep 2023 12:12:40 +0000 Subject: [PATCH 08/77] [AArch64][SME] Disable remat of VL-dependent ops when function changes streaming mode. This is a way to prevent the register allocator from inserting instructions which behave differently for different runtime vector-lengths, inside a call-sequence which changes the streaming-SVE mode before/after the call. I've considered using BUNDLEs in Machine IR, but found that using this is not possible for a few reasons: * Most passes don't look inside BUNDLEs, but some passes would need to look inside these call-sequence bundles, for example the PrologEpilog pass (to remove the CALLSEQSTART/END), a PostRA pass to remove COPY instructions, or the AArch64PseudoExpand pass. * Within the streaming-mode-changing call sequence, one of the instructions is a CALLSEQEND. The corresponding CALLSEQBEGIN (AArch64::ADJCALLSTACKUP) is outside this sequence. This means we'd end up with a BUNDLE that has [SMSTART, COPY, BL, ADJCALLSTACKUP, COPY, SMSTOP]. The MachineVerifier doesn't accept this, and we also can't move the CALLSEQSTART into the call sequence. Maybe in the future we could model this differently by modelling the runtime vector-length as a value that's used by certain operations (similar to e.g. NCZV flags) and clobbered by SMSTART/MMSTOP, such that the register allocator can consider these as actual dependences and avoid rematerialization. For now we just want to address the immediate problem. 
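As an illustration of where such a check ends up (not part of this patch: only
`hasStreamingModeChanges()` below is taken from the diff that follows, and
`isVLDependentOpcode` is an assumed helper, not a real LLVM API):

  // Hypothetical sketch: refuse to rematerialize instructions whose result
  // depends on the runtime vector length (e.g. rdvl/addvl/cnt*) once the
  // function is known to change streaming mode, because VL may differ on
  // either side of an smstart/smstop.
  bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
      const MachineInstr &MI) const {
    const MachineFunction &MF = *MI.getMF();
    const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
    if (AFI->hasStreamingModeChanges() && isVLDependentOpcode(MI.getOpcode()))
      return false; // assumed helper; shown for illustration only
    return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
  }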
Reviewed By: paulwalker-arm, aemerson

Differential Revision: https://reviews.llvm.org/D159193

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  4 ++
 .../AArch64/AArch64MachineFunctionInfo.h      |  7 ++
 ...materialize-with-streaming-mode-changes.ll | 71 +++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5109822101ad..ddcb84566672 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7110,6 +7110,10 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
 SDValue AArch64TargetLowering::changeStreamingMode(
     SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
     SDValue PStateSM, bool Entry) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  FuncInfo->setHasStreamingModeChanges(true);
+
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
   SDValue MSROp =
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d82fb436925e..8df95ff1e6ea 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -185,6 +185,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// The frame-index for the TPIDR2 object used for lazy saves.
   Register LazySaveTPIDR2Obj = 0;
 
+  /// Whether this function changes streaming mode within the function.
+  bool HasStreamingModeChanges = false;
 
   /// True if the function need unwind information.
   mutable std::optional<bool> NeedsDwarfUnwindInfo;
@@ -447,6 +449,11 @@ public:
   bool needsDwarfUnwindInfo(const MachineFunction &MF) const;
   bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const;
 
+  bool hasStreamingModeChanges() const { return HasStreamingModeChanges; }
+  void setHasStreamingModeChanges(bool HasChanges) {
+    HasStreamingModeChanges = HasChanges;
+  }
+
 private:
   // Hold the lists of LOHs.
  MILOHContainer LOHContainerSet;
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll
new file mode 100644
index 000000000000..b3aeb1fcc42d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+
+define void @dont_rematerialize_cntd(i32 %N) #0 {
+; CHECK-LABEL: dont_rematerialize_cntd:
+; CHECK: cntd
+; CHECK: smstop sm
+; CHECK-NOT: cntd
+; CHECK: bl foo
+; CHECK: smstart sm
+entry:
+  %cmp2 = icmp sgt i32 %N, 0
+  br i1 %cmp2, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+  %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind
+  %.tr = call i32 @llvm.vscale.i32()
+  %conv = shl nuw nsw i32 %.tr, 4
+  call void @foo(i32 %conv)
+  %inc = add nuw nsw i32 %index.03, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+  ret void
+}
+
+; This test doesn't strictly make sense, because it passes a scalable predicate
+; to a function, which makes little sense if the VL is not the same in/out of
+; streaming-SVE mode. If the VL is known to be the same, then we could just as
+; well rematerialize the `ptrue` inside the call sequence. However, the purpose
+; of this test is more to ensure that the logic works, which may also trigger
+; when the value is not being passed as argument (e.g. when it is hoisted from
+; a loop and placed inside the call sequence).
+;
+; FIXME: This test also exposes another bug, where the 'mul vl' addressing mode
+; is used before/after the smstop. This will be fixed in a future patch.
define void @dont_rematerialize_ptrue(i32 %N) #0 {
+; CHECK-LABEL: dont_rematerialize_ptrue:
+; CHECK: ptrue [[PTRUE:p[0-9]+]].b
+; CHECK: str [[PTRUE]], [[[SPILL_ADDR:.*]]]
+; CHECK: smstop sm
+; CHECK: ldr p0, [[[SPILL_ADDR]]]
+; CHECK-NOT: ptrue
+; CHECK: bl bar
+; CHECK: smstart sm
+entry:
+  %cmp2 = icmp sgt i32 %N, 0
+  br i1 %cmp2, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+  %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind
+  %ptrue.ins = insertelement <vscale x 16 x i1> poison, i1 1, i32 0
+  %ptrue = shufflevector <vscale x 16 x i1> %ptrue.ins, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  call void @bar(<vscale x 16 x i1> %ptrue)
+  %inc = add nuw nsw i32 %index.03, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+  ret void
+}
+declare void @foo(i32)
+declare void @bar(<vscale x 16 x i1>)
+declare i32 @llvm.vscale.i32()
+
+attributes #0 = { "aarch64_pstate_sm_enabled" "frame-pointer"="non-leaf" "target-features"="+sme,+sve" }
-- 
Gitee

From b0890e854c33ad237641a1b2101a3c506883f516 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Fri, 1 Sep 2023 12:18:59 +0000
Subject: [PATCH 09/77] [AArch64][SME] Don't use OBSCURE_COPY to avoid
 rematerialization.

This is intended to be a non-functional change.

This patch removes OBSCURE_COPY in favour of using
`forceDisableTriviallyReMaterializable`.
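For context, the removed mechanism versus its replacement can be sketched as
follows; the first snippet is taken from the diff below, while the
`forceDisableTriviallyReMaterializable` spelling comes from this commit
message and is not a verified signature:

  // Before (removed in this patch): wrap frame-index arguments in an opaque
  // pseudo so the register coalescer cannot look through the copy and
  // rematerialize an addvl-based frame-index computation between the
  // smstart/smstop and the call.
  if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
    Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);

  // After: no per-argument wrapping; rematerialization is instead suppressed
  // for the whole function via the streaming-mode-changes flag added in the
  // previous patch.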
Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D159194

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 11 -----------
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp      |  6 ------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h        |  7 -------
 llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td       |  6 ------
 4 files changed, 30 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index dcb73ae2dce2..0b12f5515a1e 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1481,17 +1481,6 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
     return true;
   }
-  case AArch64::OBSCURE_COPY: {
-    if (MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) {
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
-          .add(MI.getOperand(0))
-          .addReg(AArch64::XZR)
-          .add(MI.getOperand(1))
-          .addImm(0);
-    }
-    MI.eraseFromParent();
-    return true;
-  }
   }
   return false;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ddcb84566672..c4af0a70c362 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2276,7 +2276,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((AArch64ISD::NodeType)Opcode) {
   case AArch64ISD::FIRST_NUMBER:
     break;
-    MAKE_CASE(AArch64ISD::OBSCURE_COPY)
     MAKE_CASE(AArch64ISD::SMSTART)
     MAKE_CASE(AArch64ISD::SMSTOP)
     MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -7458,11 +7457,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
           return ArgReg.Reg == VA.getLocReg();
         });
       } else {
-        // Add an extra level of indirection for streaming mode changes by
-        // using a pseudo copy node that cannot be rematerialised between a
-        // smstart/smstop and the call by the simple register coalescer.
-        if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
-          Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);
         RegsToPass.emplace_back(VA.getLocReg(), Arg);
         RegsUsed.insert(VA.getLocReg());
         const TargetOptions &Options = DAG.getTarget().Options;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index aca45f113e73..a6771921eada 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,13 +58,6 @@ enum NodeType : unsigned {
 
   CALL_BTI, // Function call followed by a BTI instruction.
 
-  // Essentially like a normal COPY that works on GPRs, but cannot be
-  // rematerialised by passes like the simple register coalescer. It's
-  // required for SME when lowering calls because we cannot allow frame
-  // index calculations using addvl to slip in between the smstart/smstop
-  // and the bl instruction. The scalable vector length may change across
-  // the smstart/smstop boundary.
- OBSCURE_COPY, SMSTART, SMSTOP, RESTORE_ZA, diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 352f34cab132..f306021dd753 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -23,8 +23,6 @@ def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA", SDTypeProfile<0, 3, [SDNPHasChain, SDNPSideEffect, SDNPVariadic, SDNPOptInGlue]>; -def AArch64ObscureCopy : SDNode<"AArch64ISD::OBSCURE_COPY", SDTypeProfile<1, 1, []>, []>; - //===----------------------------------------------------------------------===// // Instruction naming conventions. //===----------------------------------------------------------------------===// @@ -185,10 +183,6 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), (MSR 0xde85, GPR64:$val)>; def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>; - -def OBSCURE_COPY : Pseudo<(outs GPR64:$dst), (ins GPR64:$idx), []>, Sched<[]> { } -def : Pat<(i64 (AArch64ObscureCopy (i64 GPR64:$idx))), - (OBSCURE_COPY GPR64:$idx)>; } // End let Predicates = [HasSME] // Pseudo to match to smstart/smstop. This expands: -- Gitee From 6a1704c10f73890d1c1365fa1c475b8c35013dd8 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 1 Sep 2023 14:34:13 +0000 Subject: [PATCH 10/77] [SME] Don't scavenge a spillslot in callee-save area in presence of streaming-mode changes. If no frame-pointer is available and the compiler has scavenged a spill-slot in the callee-save area, the compiler may be forced to emit an 'addvl' inside the streaming-mode-changing call sequence when it needs to fill (reload) an FP register being passed to the call. We can avoid this entirely by disabling stack-slot scavenging when there are streaming-mode-changing call-sequences in the function. Reviewed By: david-arm Differential Revision: https://reviews.llvm.org/D159196 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64FrameLowering.cpp | 7 ++ .../AArch64/sme-disable-gisel-fisel.ll | 68 +++++++++---------- .../CodeGen/AArch64/sme-streaming-body.ll | 32 +++++---- .../AArch64/sme-streaming-interface.ll | 13 ++-- ...nging-call-disable-stackslot-scavenging.ll | 49 +++++++++++++ 5 files changed, 113 insertions(+), 56 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 4d5676f34101..83af97f8c8c7 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -427,6 +427,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + // Win64 EH requires a frame pointer if funclets are present, as the locals // are accessed off the frame pointer in both the parent function and the // funclets. 
@@ -3252,6 +3253,12 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( bool AArch64FrameLowering::enableStackSlotScavenging( const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); + // If the function has streaming-mode changes, don't scavenge a + // spillslot in the callee-save area, as that might require an + // 'addvl' in the streaming-mode-changing call-sequence when the + // function doesn't use a FP. + if (AFI->hasStreamingModeChanges() && !hasFP(MF)) + return false; return AFI->hasCalleeSaveStackFreeSpace(); } diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 0118d7df2564..979ac24e3869 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -17,15 +17,15 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstart sm -; CHECK-FISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: bl streaming_callee -; CHECK-FISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstop sm ; CHECK-FISEL-NEXT: adrp x8, .LCPI0_0 ; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI0_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload +; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: fadd d0, d1, d0 ; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -43,15 +43,15 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstart sm -; CHECK-GISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: bl streaming_callee -; CHECK-GISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstop sm ; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 ; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: fadd d0, d1, d0 ; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -76,15 +76,15 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline ; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstop sm -; 
CHECK-FISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: bl normal_callee -; CHECK-FISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstart sm ; CHECK-FISEL-NEXT: adrp x8, .LCPI1_0 ; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload +; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: fadd d0, d1, d0 ; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -102,15 +102,15 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline ; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstop sm -; CHECK-GISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: bl normal_callee -; CHECK-GISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstart sm ; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 ; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: fadd d0, d1, d0 ; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -128,32 +128,32 @@ entry: define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" { ; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: sub sp, sp, #96 -; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #112 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl normal_callee -; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp, #16] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 ; CHECK-COMMON-NEXT: fmov d0, x8 -; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: 
fadd d0, d1, d0 -; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #112 ; CHECK-COMMON-NEXT: ret %call = call double @normal_callee(double %x); %add = fadd double %call, 4.200000e+01 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index 7561e75d04ef..c30e991d8f2f 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; @@ -237,25 +237,27 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible" define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: %0 = call fast double @llvm.cos.f64(double %x) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 2f4c907c8724..a338db9f8138 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s ; This file tests the following combinations related to streaming-enabled functions: ; [ ] N -> S (Normal -> Streaming) @@ -313,20 +313,19 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d1, d0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: fadd d0, d1, d0 ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp 
d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload
-; CHECK-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT: fadd d0, d1, d0
 ; CHECK-NEXT: add sp, sp, #96
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
new file mode 100644
index 000000000000..e4cd4d6c05c5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; This function would normally scavenge a stackslot from the callee-save
+; area, which would lead to spilling 's0' to that stackslot before the
+; smstop and filling it with 'addvl + <offset>' after the smstop because
+; the frame-pointer is not available.
+; This would not be valid, since the vector-length has changed so 'addvl'
+; cannot be used. This is testing that the stackslot-scavenging is disabled
+; when there are any streaming-mode-changing call-sequences in the
+; function.
+define void @test_no_stackslot_scavenging(float %f) #0 {
+; CHECK-LABEL: test_no_stackslot_scavenging:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: bl use_f
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %ptr = alloca <vscale x 16 x i8>
+  call void asm sideeffect "", "~{x24}"() nounwind
+  call void @use_f(float %f)
+  ret void
+}
+
+declare void @use_f(float)
+
+attributes #0 = { nounwind "target-features"="+sme" "aarch64_pstate_sm_enabled" }
--
Gitee

From ee58403283304b7c321aef8bc466e58ec5a1faf8 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Wed, 9 Aug 2023 12:31:44 +0000
Subject: [PATCH 11/77] [Clang][AArch64] Add diagnostic for calls from non-ZA to shared-ZA functions.

The caller is required to have ZA state if it wants to share it with a
callee.
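For illustration, a minimal sketch of what this diagnostic accepts and
rejects (hypothetical function names; it mirrors the aarch64-sme-func-attrs.c
tests added below):

    void shared_callee(void) __arm_shared_za;

    void caller_without_za(void) {
      shared_callee(); // error: call to a shared ZA function requires the
                       // caller to have ZA state
    }

    __arm_new_za void caller_with_za(void) {
      shared_callee(); // OK: __arm_new_za gives this function ZA state
    }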
Reviewed By: aaron.ballman

Differential Revision: https://reviews.llvm.org/D157270

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../clang/Basic/DiagnosticSemaKinds.td | 2 ++
 clang/lib/Sema/SemaChecking.cpp | 16 +++++++++++++
 .../aarch64-sme-attrs.cpp | 4 ++--
 clang/test/Sema/aarch64-sme-func-attrs.c | 24 +++++++++++++++++++
 4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a174daabc8cd..a1f72cf14cbe 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3629,6 +3629,8 @@ def err_sme_attr_mismatch : Error<
   "function declared %0 was previously declared %1, which has different SME function attributes">;
 def err_sme_call_in_non_sme_target : Error<
   "call to a streaming function requires 'sme'">;
+def err_sme_za_call_no_za_state : Error<
+  "call to a shared ZA function requires the caller to have ZA state">;
 def err_sme_definition_using_sm_in_non_sme_target : Error<
   "function executed in streaming-SVE mode requires 'sme'">;
 def err_sme_definition_using_za_in_non_sme_target : Error<
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 0af822a3d709..d9fe73ddfb71 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -7149,6 +7149,22 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
         Diag(Loc, diag::err_sme_call_in_non_sme_target);
     }
   }
+
+  // If the callee uses AArch64 SME ZA state but the caller doesn't define
+  // any, then this is an error.
+  if (ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateZASharedMask) {
+    bool CallerHasZAState = false;
+    if (const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) {
+      if (CallerFD->hasAttr<ArmNewZAAttr>())
+        CallerHasZAState = true;
+      else if (const auto *FPT = CallerFD->getType()->getAs<FunctionProtoType>())
+        CallerHasZAState = FPT->getExtProtoInfo().AArch64SMEAttributes &
+                           FunctionType::SME_PStateZASharedMask;
+    }
+
+    if (!CallerHasZAState)
+      Diag(Loc, diag::err_sme_za_call_no_za_state);
+  }
 }

 if (FDecl && FDecl->hasAttr()) {
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
index 52937495484d..0768bfc33238 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
@@ -255,7 +255,7 @@ int call() { return 0; }

 template <typename T, typename... Other> __attribute__((always_inline))
-int call(T f, Other... other) {
+int call(T f, Other... other) __arm_shared_za {
   return f() + call(other...);
 }

@@ -270,7 +270,7 @@ int call(T f, Other...
other) { // CHECK-NEXT: add nsw // CHECK-NEXT: add nsw // CHECK-NEXT: ret -int test_variadic_template() { +int test_variadic_template() __arm_shared_za { return call(normal_callee, streaming_decl, streaming_compatible_decl, diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c index 71140a9ebc61..73c0934d689e 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -178,7 +178,31 @@ void redecl_preserve_za(void) __arm_shared_za {} void redecl_nopreserve_za(void) __arm_shared_za; void redecl_nopreserve_za(void) __arm_shared_za __arm_preserves_za {} +void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) { + sme_arm_new_za(); // OK + // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}} + // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}} + sme_arm_shared_za(); + // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}} + // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}} + shared_za_fn_ptr(); +} + +void shared_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) __arm_shared_za { + sme_arm_shared_za(); // OK + shared_za_fn_ptr(); // OK +} + +__arm_new_za void new_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) { + sme_arm_shared_za(); // OK + shared_za_fn_ptr(); // OK +} + #ifdef __cplusplus +int shared_za_initializer(void) __arm_shared_za; +// expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}} +int global = shared_za_initializer(); + struct S { virtual void shared_za_memberfn(void) __arm_shared_za; }; -- Gitee From 6db31271dc85104c71bfab040952521409413af4 Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Wed, 9 Aug 2023 15:37:51 +0000 Subject: [PATCH 12/77] [AArch64][SVE] Add asm predicate constraint Uph Some instructions such as multi-vector LD1 only accept a range of PN8-PN15 predicate-as-counter. This new constraint allows more refined parsing and better decision making when parsing these instructions from ASM, instead of defaulting to Upa which incorrectly uses the whole range of registers P0-P15 from the register class PPR. 
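As a rough usage sketch (hypothetical function, assuming an SVE-enabled
target; INCP is simply a convenient predicate-consuming instruction), the new
constraint pins an inline-asm predicate operand to P8-P15, where "Upa" would
allow P0-P15 and "Upl" only P0-P7:

    // Increment x by the number of active lanes in pg (incp Xdn, Pm.b).
    // "Uph" forces pg to be allocated to one of P8-P15.
    long long count_active(long long x, __SVBool_t pg) {
      asm("incp %0, %1.b" : "+r"(x) : "Uph"(pg));
      return x;
    }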
Differential Revision: https://reviews.llvm.org/D157517

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 clang/lib/Basic/Targets/AArch64.cpp | 5 ++-
 .../aarch64-sve-inline-asm-datatypes.c | 24 +++++++++++
 llvm/docs/LangRef.rst | 3 +-
 .../Target/AArch64/AArch64ISelLowering.cpp | 43 +++++++++++--------
 llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 11 +++++
 5 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 0889262b0c9c..6d22b65df3da 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -1302,8 +1302,9 @@ bool AArch64TargetInfo::validateAsmConstraint(
     Info.setAllowsRegister();
     return true;
   case 'U':
-    if (Name[1] == 'p' && (Name[2] == 'l' || Name[2] == 'a')) {
-      // SVE predicate registers ("Upa"=P0-15, "Upl"=P0-P7)
+    if (Name[1] == 'p' &&
+        (Name[2] == 'l' || Name[2] == 'a' || Name[2] == 'h')) {
+      // SVE predicate registers ("Upa"=P0-15, "Upl"=P0-P7, "Uph"=P8-P15)
       Info.setAllowsRegister();
       Name += 2;
       return true;
diff --git a/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c b/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c
index 5c1e931a7271..14a29dfac2c7 100644
--- a/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c
+++ b/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c
@@ -168,6 +168,30 @@ SVBOOL_TEST_UPL(__SVInt32_t, s) ;
 SVBOOL_TEST_UPL(__SVInt64_t, d) ;
 // CHECK: call <vscale x 16 x i1> asm sideeffect "fadd $0.d, $1.d, $2.d, $3.d\0A", "=w,@3Upl,w,w"(<vscale x 16 x i1> %in1, <vscale x 2 x i64> %in2, <vscale x 2 x i64> %in3)
+
+#define SVBOOL_TEST_UPH(DT, KIND)\
+__SVBool_t func_bool_uph_##KIND(__SVBool_t in1, DT in2, DT in3)\
+{\
+  __SVBool_t out;\
+  asm volatile (\
+    "fadd %[out]." #KIND ", %[in1]." #KIND ", %[in2]." #KIND ", %[in3]." #KIND "\n"\
+    : [out] "=w" (out)\
+    : [in1] "Uph" (in1),\
+      [in2] "w" (in2),\
+      [in3] "w" (in3)\
+    :);\
+  return out;\
+}
+
+SVBOOL_TEST_UPH(__SVInt8_t, b) ;
+// CHECK: call <vscale x 16 x i1> asm sideeffect "fadd $0.b, $1.b, $2.b, $3.b\0A", "=w,@3Uph,w,w"(<vscale x 16 x i1> %in1, <vscale x 16 x i8> %in2, <vscale x 16 x i8> %in3)
+SVBOOL_TEST_UPH(__SVInt16_t, h) ;
+// CHECK: call <vscale x 16 x i1> asm sideeffect "fadd $0.h, $1.h, $2.h, $3.h\0A", "=w,@3Uph,w,w"(<vscale x 16 x i1> %in1, <vscale x 8 x i16> %in2, <vscale x 8 x i16> %in3)
+SVBOOL_TEST_UPH(__SVInt32_t, s) ;
+// CHECK: call <vscale x 16 x i1> asm sideeffect "fadd $0.s, $1.s, $2.s, $3.s\0A", "=w,@3Uph,w,w"(<vscale x 16 x i1> %in1, <vscale x 4 x i32> %in2, <vscale x 4 x i32> %in3)
+SVBOOL_TEST_UPH(__SVInt64_t, d) ;
+// CHECK: call <vscale x 16 x i1> asm sideeffect "fadd $0.d, $1.d, $2.d, $3.d\0A", "=w,@3Uph,w,w"(<vscale x 16 x i1> %in1, <vscale x 2 x i64> %in2, <vscale x 2 x i64> %in3)
+
+
 #define SVFLOAT_TEST(DT,KIND)\
 DT func_float_##DT##KIND(DT inout1, DT in2)\
 {\
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 701cd6e5dbb6..4fd47bb2bbda 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4994,7 +4994,8 @@ AArch64:
 - ``w``: A 32, 64, or 128-bit floating-point, SIMD or SVE vector register.
 - ``x``: Like w, but restricted to registers 0 to 15 inclusive.
 - ``y``: Like w, but restricted to SVE vector registers Z0 to Z7 inclusive.
-- ``Upl``: One of the low eight SVE predicate registers (P0 to P7)
+- ``Uph``: One of the upper eight SVE predicate registers (P8 to P15)
+- ``Upl``: One of the lower eight SVE predicate registers (P0 to P7)
 - ``Upa``: Any of the SVE predicate registers (P0 to P15)

 AMDGPU:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c4af0a70c362..a9ec964fe120 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9944,19 +9944,31 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   return "r";
 }

-enum PredicateConstraint {
-  Upl,
-  Upa,
-  Invalid
-};
+enum PredicateConstraint { Uph, Upl, Upa, Invalid };

 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
-  PredicateConstraint P = PredicateConstraint::Invalid;
-  if (Constraint == "Upa")
-    P = PredicateConstraint::Upa;
-  if (Constraint == "Upl")
-    P = PredicateConstraint::Upl;
-  return P;
+  return StringSwitch<PredicateConstraint>(Constraint)
+      .Case("Uph", PredicateConstraint::Uph)
+      .Case("Upl", PredicateConstraint::Upl)
+      .Case("Upa", PredicateConstraint::Upa)
+      .Default(PredicateConstraint::Invalid);
+}
+
+static const TargetRegisterClass *
+getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
+  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
+    return nullptr;
+
+  switch (Constraint) {
+  default:
+    return nullptr;
+  case PredicateConstraint::Uph:
+    return &AArch64::PPR_p8to15RegClass;
+  case PredicateConstraint::Upl:
+    return &AArch64::PPR_3bRegClass;
+  case PredicateConstraint::Upa:
+    return &AArch64::PPRRegClass;
+  }
 }

 // The set of cc code supported is from
@@ -10148,13 +10160,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
     }
   } else {
     PredicateConstraint PC = parsePredicateConstraint(Constraint);
-    if (PC != PredicateConstraint::Invalid) {
-      if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
-        return std::make_pair(0U, nullptr);
-      bool restricted = (PC == PredicateConstraint::Upl);
-      return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
-                        : std::make_pair(0U, &AArch64::PPRRegClass);
-    }
+    if (const TargetRegisterClass *RegClass = getPredicateRegisterClass(PC, VT))
+      return std::make_pair(0U, RegClass);
   }
   if (StringRef("{cc}").equals_insensitive(Constraint) ||
       parseConstraintCode(Constraint) != AArch64CC::Invalid)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
index ad1093028c1a..2f1e2ad5e2fd 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
@@ -68,3 +68,14 @@ define @test_incp( %Pg,
   %1 = tail call asm "incp $0.s, $1", "=w,@3Upa,0"( %Pg, %Zn)
   ret %1
 }
+
+; Function Attrs: nounwind readnone
+; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
+; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
+; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[ARG4:%[0-9]+]]:ppr_p8to15 = COPY [[ARG3]]
+; CHECK: INLINEASM {{.*}} [[ARG4]]
+define <vscale x 8 x half> @test_svfadd_f16_Uph_constraint(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
+  ret <vscale x 8 x half> %1
+}
--
Gitee

From 8966611b1fa05440933f6645461f546a179ebf40 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov
Date: Mon, 14 Aug 2023 13:49:57 +0000
Subject: [PATCH 13/77] [AArch64][SME] Non-streaming compatible SCVTF emitted with --force-streaming-compatible-sve

For scalar integer to floating-point converts in streaming-compatible
SVE mode, use the non-NEON version of the convert instruction.

Differential Revision: https://reviews.llvm.org/D157698

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../Target/AArch64/AArch64ISelLowering.cpp | 3 +-
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 198 ++++++++++++++++++
 2 files changed, 200 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a9ec964fe120..ef04d9ceb72a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16556,7 +16556,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
   // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
   SDValue N0 = N->getOperand(0);
-  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+  if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
+      N0.hasOneUse() &&
       // Do not change the width of a volatile load.
!cast(N0)->isVolatile()) { LoadSDNode *LN0 = cast(N0); diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 0bd767cd4365..4ae4e6538703 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -1122,3 +1122,201 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) { store <4 x double> %res, ptr %b ret void } + +define half @scvtf_i16_f16(ptr %0) { +; CHECK-LABEL: scvtf_i16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to half + ret half %3 +} + +define float @scvtf_i16_f32(ptr %0) { +; CHECK-LABEL: scvtf_i16_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to float + ret float %3 +} + +define double @scvtf_i16_f64(ptr %0) { +; CHECK-LABEL: scvtf_i16_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to double + ret double %3 +} + +define half @scvtf_i32_f16(ptr %0) { +; CHECK-LABEL: scvtf_i32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to half + ret half %3 +} + +define float @scvtf_i32_f32(ptr %0) { +; CHECK-LABEL: scvtf_i32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to float + ret float %3 +} + +define double @scvtf_i32_f64(ptr %0) { +; CHECK-LABEL: scvtf_i32_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to double + ret double %3 +} + +define half @scvtf_i64_f16(ptr %0) { +; CHECK-LABEL: scvtf_i64_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf h0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to half + ret half %3 +} + +define float @scvtf_i64_f32(ptr %0) { +; CHECK-LABEL: scvtf_i64_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to float + ret float %3 +} + +define double @scvtf_i64_f64(ptr %0) { +; CHECK-LABEL: scvtf_i64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to double + ret double %3 +} + +define half @ucvtf_i16_f16(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ucvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to half + ret half %3 +} + +define float @ucvtf_i16_f32(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ucvtf s0, s0 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to float + ret float %3 +} + +define double @ucvtf_i16_f64(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to double + ret double %3 +} + +define half @ucvtf_i32_f16(ptr %0) 
{
+; CHECK-LABEL: ucvtf_i32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ucvtf h0, w8
+; CHECK-NEXT: ret
+  %2 = load i32, ptr %0, align 64
+  %3 = uitofp i32 %2 to half
+  ret half %3
+}
+
+define float @ucvtf_i32_f32(ptr %0) {
+; CHECK-LABEL: ucvtf_i32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ret
+  %2 = load i32, ptr %0, align 64
+  %3 = uitofp i32 %2 to float
+  ret float %3
+}
+
+define double @ucvtf_i32_f64(ptr %0) {
+; CHECK-LABEL: ucvtf_i32_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ucvtf d0, d0
+; CHECK-NEXT: ret
+  %2 = load i32, ptr %0, align 64
+  %3 = uitofp i32 %2 to double
+  ret double %3
+}
+
+define half @ucvtf_i64_f16(ptr %0) {
+; CHECK-LABEL: ucvtf_i64_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: ucvtf h0, x8
+; CHECK-NEXT: ret
+  %2 = load i64, ptr %0, align 64
+  %3 = uitofp i64 %2 to half
+  ret half %3
+}
+
+define float @ucvtf_i64_f32(ptr %0) {
+; CHECK-LABEL: ucvtf_i64_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: ucvtf s0, x8
+; CHECK-NEXT: ret
+  %2 = load i64, ptr %0, align 64
+  %3 = uitofp i64 %2 to float
+  ret float %3
+}
+
+define double @ucvtf_i64_f64(ptr %0) {
+; CHECK-LABEL: ucvtf_i64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: ret
+  %2 = load i64, ptr %0, align 64
+  %3 = uitofp i64 %2 to double
+  ret double %3
+}
--
Gitee

From 012ea857819cc7a76dd1350a0512c7025326c22b Mon Sep 17 00:00:00 2001
From: Corentin Jabot
Date: Sat, 10 Jul 2021 15:52:54 +0200
Subject: [PATCH 14/77] Complete the implementation of P2361 Unevaluated string literals

The attribute changes were left out of Clang 17.

Attributes that used to take a string literal now accept an unevaluated
string literal instead, which means they reject numeric escape sequences
and string literals with an encoding prefix, but the latter was already
ill-formed in most cases.

We need to know that we are going to parse an unevaluated string literal
before we do, so that we can reject numeric escape sequences; we therefore
derive from Attrs.td which attribute parameters are expected to be string
literals.
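For illustration, a minimal sketch of the new behaviour (hypothetical
declarations; the diagnostics are the ones exercised by the tests below):

    [[deprecated("use g() instead")]] void f(); // OK: plain string literal

    [[nodiscard("\123")]] int g();  // error: invalid escape sequence '\123'
                                    // in an unevaluated string literal

    [[deprecated(L"abc")]] void h(); // warning: encoding prefix 'L' on an
                                     // unevaluated string literal has no effect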
Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D156237 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/docs/ReleaseNotes.rst | 3 + .../clang/Basic/DiagnosticCommonKinds.td | 4 +- clang/include/clang/Parse/Parser.h | 7 ++ clang/include/clang/Sema/ParsedAttr.h | 15 +++ clang/lib/Parse/ParseDecl.cpp | 91 ++++++++++++++++++- clang/lib/Parse/ParseExpr.cpp | 2 +- clang/lib/Sema/SemaDeclAttr.cpp | 12 ++- clang/test/Parser/MicrosoftExtensions.cpp | 4 +- clang/test/Parser/c2x-attributes.c | 9 +- clang/test/Parser/cxx-attributes.cpp | 2 +- clang/test/Parser/cxx0x-attributes.cpp | 10 +- clang/test/Sema/MicrosoftExtensions.c | 2 +- clang/test/Sema/annotate-type.c | 4 +- clang/test/Sema/annotate.c | 4 +- clang/test/Sema/attr-assume.c | 8 +- clang/test/Sema/attr-btf_tag.c | 2 +- clang/test/Sema/attr-btf_type_tag.c | 2 +- clang/test/Sema/attr-capabilities.c | 4 +- clang/test/Sema/attr-enforce-tcb-errors.cpp | 4 +- clang/test/Sema/attr-enforce-tcb-errors.m | 4 +- clang/test/Sema/attr-error.c | 2 +- clang/test/Sema/attr-handles.cpp | 2 +- clang/test/Sema/attr-section.c | 2 +- clang/test/Sema/attr-tls_model.c | 2 +- clang/test/Sema/attr-unavailable-message.c | 2 +- clang/test/Sema/attr-warning.c | 2 +- clang/test/Sema/diagnose_if.c | 2 +- clang/test/Sema/enable_if.c | 2 +- .../attr-deprecated-replacement-error.cpp | 8 +- clang/test/SemaCXX/attr-no-sanitize.cpp | 2 +- clang/test/SemaCXX/attr-section.cpp | 2 +- clang/test/SemaCXX/attr-weakref.cpp | 2 +- clang/test/SemaCXX/suppress.cpp | 2 +- clang/test/SemaObjC/attr-swift_bridge.m | 2 +- .../SemaObjC/objc-asm-attribute-neg-test.m | 4 +- .../test/SemaObjC/validate-attr-swift_attr.m | 2 +- clang/test/SemaTemplate/attributes.cpp | 44 +++------ clang/utils/TableGen/ClangAttrEmitter.cpp | 45 +++++++++ clang/www/cxx_status.html | 7 +- 39 files changed, 241 insertions(+), 88 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 05dad41c07fa..7b199ffd0397 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -160,6 +160,9 @@ Resolutions to C++ Defect Reports ``-Wdeprecated-literal-operator`` for the latter, off by default for now. .. code-block:: c++ +- Attributes now expect unevaluated strings in attributes parameters that are string literals. + This is applied to both C++ standard attributes, and other attributes supported by Clang. + This completes the implementation of `P2361R6 Unevaluated Strings _` // What follows is warned by -Wuser-defined-literals // albeit "ill-formed, no diagnostic required". 
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index d7cc3cff0797..d35445db231e 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -56,7 +56,9 @@ def err_expected_string_literal : Error<"expected string literal "
   "%select{in %1|for diagnostic message in static_assert|"
   "for optional message in 'availability' attribute|"
   "for %select{language name|source container name|USR}1 in "
-  "'external_source_symbol' attribute}0">;
+  "'external_source_symbol' attribute|"
+  "as argument of '%1' attribute}0">;
+
 def err_invalid_string_udl : Error<
   "string literal with user-defined suffix cannot be used here">;
 def err_invalid_character_udl : Error<
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 475dfe845528..43e3a9d4d398 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -2790,6 +2790,13 @@ private:
   /// clang accepts as an extension.
   void DiagnoseCXX11AttributeExtension(ParsedAttributes &Attrs);

+  ExprResult ParseUnevaluatedStringInAttribute(const IdentifierInfo &AttrName);
+
+  bool
+  ParseAttributeArgumentList(const clang::IdentifierInfo &AttrName,
+                             SmallVectorImpl<Expr *> &Exprs,
+                             ParsedAttributeArgumentsProperties ArgsProperties);
+
   /// Parses syntax-generic attribute arguments for attributes which are
   /// known to the implementation, and adds them to the given ParsedAttributes
   /// list with the given attribute syntax. Returns the number of arguments
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 592580bccd23..8c0edca1ebc5 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/VersionTuple.h"
+#include <bitset>
 #include <cassert>
 #include <cstddef>
 #include <cstring>
@@ -911,6 +912,20 @@ private:
   VecTy AttrList;
 };

+struct ParsedAttributeArgumentsProperties {
+  ParsedAttributeArgumentsProperties(uint32_t StringLiteralBits)
+      : StringLiterals(StringLiteralBits) {}
+  bool isStringLiteralArg(unsigned I) const {
+    // If the last bit is set, assume we have a variadic parameter
+    if (I >= StringLiterals.size())
+      return StringLiterals.test(StringLiterals.size() - 1);
+    return StringLiterals.test(I);
+  }
+
+private:
+  std::bitset<32> StringLiterals;
+};
+
 /// ParsedAttributes - A collection of parsed attributes. Currently
 /// we don't differentiate between the various attribute syntaxes,
 /// which is basically silly.
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index cf1e3a94de7f..97cd51359b62 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -288,6 +288,16 @@ static bool attributeHasIdentifierArg(const IdentifierInfo &II) {
 #undef CLANG_ATTR_IDENTIFIER_ARG_LIST
 }

+/// Determine whether the given attribute has string literal arguments.
+static ParsedAttributeArgumentsProperties
+attributeStringLiteralListArg(const IdentifierInfo &II) {
+#define CLANG_ATTR_STRING_LITERAL_ARG_LIST
+  return llvm::StringSwitch<uint32_t>(normalizeAttrName(II.getName()))
+#include "clang/Parse/AttrParserStringSwitches.inc"
+      .Default(0);
+#undef CLANG_ATTR_STRING_LITERAL_ARG_LIST
+}
+
 /// Determine whether the given attribute has a variadic identifier argument.
 static bool attributeHasVariadicIdentifierArg(const IdentifierInfo &II) {
 #define CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
@@ -371,6 +381,81 @@ void Parser::ParseAttributeWithTypeArg(IdentifierInfo &AttrName,
                      ScopeName, ScopeLoc, nullptr, 0, Form);
 }

+ExprResult
+Parser::ParseUnevaluatedStringInAttribute(const IdentifierInfo &AttrName) {
+  if (Tok.is(tok::l_paren)) {
+    BalancedDelimiterTracker Paren(*this, tok::l_paren);
+    Paren.consumeOpen();
+    ExprResult Res = ParseUnevaluatedStringInAttribute(AttrName);
+    Paren.consumeClose();
+    return Res;
+  }
+  if (!isTokenStringLiteral()) {
+    Diag(Tok.getLocation(), diag::err_expected_string_literal)
+        << /*in attribute...*/ 4 << AttrName.getName();
+    return ExprError();
+  }
+  return ParseUnevaluatedStringLiteralExpression();
+}
+
+bool Parser::ParseAttributeArgumentList(
+    const IdentifierInfo &AttrName, SmallVectorImpl<Expr *> &Exprs,
+    ParsedAttributeArgumentsProperties ArgsProperties) {
+  bool SawError = false;
+  unsigned Arg = 0;
+  while (true) {
+    ExprResult Expr;
+    if (ArgsProperties.isStringLiteralArg(Arg)) {
+      Expr = ParseUnevaluatedStringInAttribute(AttrName);
+    } else if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace)) {
+      Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists);
+      Expr = ParseBraceInitializer();
+    } else {
+      Expr = ParseAssignmentExpression();
+    }
+    Expr = Actions.CorrectDelayedTyposInExpr(Expr);
+
+    if (Tok.is(tok::ellipsis))
+      Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken());
+    else if (Tok.is(tok::code_completion)) {
+      // There's nothing to suggest in here as we parsed a full expression.
+      // Instead fail and propagate the error since caller might have something
+      // the suggest, e.g. signature help in function call. Note that this is
+      // performed before pushing the \p Expr, so that signature help can report
+      // current argument correctly.
+      SawError = true;
+      cutOffParsing();
+      break;
+    }
+
+    if (Expr.isInvalid()) {
+      SawError = true;
+      break;
+    }
+
+    Exprs.push_back(Expr.get());
+
+    if (Tok.isNot(tok::comma))
+      break;
+    // Move to the next argument, remember where the comma was.
+    Token Comma = Tok;
+    ConsumeToken();
+    checkPotentialAngleBracketDelimiter(Comma);
+    Arg++;
+  }
+
+  if (SawError) {
+    // Ensure typos get diagnosed when errors were encountered while parsing the
+    // expression list.
+ for (auto &E : Exprs) { + ExprResult Expr = Actions.CorrectDelayedTyposInExpr(E); + if (Expr.isUsable()) + E = Expr.get(); + } + } + return SawError; +} + unsigned Parser::ParseAttributeArgsCommon( IdentifierInfo *AttrName, SourceLocation AttrNameLoc, ParsedAttributes &Attrs, SourceLocation *EndLoc, IdentifierInfo *ScopeName, @@ -463,9 +548,9 @@ unsigned Parser::ParseAttributeArgsCommon( : Sema::ExpressionEvaluationContext::ConstantEvaluated); ExprVector ParsedExprs; - if (ParseExpressionList(ParsedExprs, llvm::function_ref(), - /*FailImmediatelyOnInvalidExpr=*/true, - /*EarlyTypoCorrection=*/true)) { + ParsedAttributeArgumentsProperties ArgProperties = + attributeStringLiteralListArg(*AttrName); + if (ParseAttributeArgumentList(*AttrName, ParsedExprs, ArgProperties)) { SkipUntil(tok::r_paren, StopAtSemi); return 0; } diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 75d04824d8b9..3438ab3ed9aa 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -3504,7 +3504,7 @@ bool Parser::ParseExpressionList(SmallVectorImpl &Exprs, Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken()); else if (Tok.is(tok::code_completion)) { // There's nothing to suggest in here as we parsed a full expression. - // Instead fail and propogate the error since caller might have something + // Instead fail and propagate the error since caller might have something // the suggest, e.g. signature help in function call. Note that this is // performed before pushing the \p Expr, so that signature help can report // current argument correctly. diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 7f49517f0f1a..72d6944dce93 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -349,7 +349,7 @@ bool Sema::checkStringLiteralArgumentAttr(const AttributeCommonInfo &CI, if (ArgLocation) *ArgLocation = E->getBeginLoc(); - if (!Literal || !Literal->isOrdinary()) { + if (!Literal || (!Literal->isUnevaluated() && !Literal->isOrdinary())) { Diag(E->getBeginLoc(), diag::err_attribute_argument_type) << CI << AANT_ArgumentString; return false; @@ -381,6 +381,16 @@ bool Sema::checkStringLiteralArgumentAttr(const ParsedAttr &AL, unsigned ArgNum, // Now check for an actual string literal. 
 Expr *ArgExpr = AL.getArgAsExpr(ArgNum);
+  const auto *Literal = dyn_cast<StringLiteral>(ArgExpr->IgnoreParenCasts());
+  if (ArgLocation)
+    *ArgLocation = ArgExpr->getBeginLoc();
+
+  if (!Literal || (!Literal->isUnevaluated() && !Literal->isOrdinary())) {
+    Diag(ArgExpr->getBeginLoc(), diag::err_attribute_argument_type)
+        << AL << AANT_ArgumentString;
+    return false;
+  }
+  Str = Literal->getString();
   return checkStringLiteralArgumentAttr(AL, ArgExpr, Str, ArgLocation);
 }
diff --git a/clang/test/Parser/MicrosoftExtensions.cpp b/clang/test/Parser/MicrosoftExtensions.cpp
index 01e59aa1945a..6bf802a29ace 100644
--- a/clang/test/Parser/MicrosoftExtensions.cpp
+++ b/clang/test/Parser/MicrosoftExtensions.cpp
@@ -44,8 +44,8 @@ typedef struct _GUID
   unsigned char Data4[8];
 } GUID;

-struct __declspec(uuid(L"00000000-0000-0000-1234-000000000047")) uuid_attr_bad1 { };// expected-error {{'uuid' attribute requires a string}}
-struct __declspec(uuid(3)) uuid_attr_bad2 { };// expected-error {{'uuid' attribute requires a string}}
+struct __declspec(uuid(L"00000000-0000-0000-1234-000000000047")) uuid_attr_bad1 { };// expected-warning {{encoding prefix 'L' on an unevaluated string literal has no effect and is incompatible with c++2c}}
+struct __declspec(uuid(3)) uuid_attr_bad2 { };// expected-error {{expected string literal as argument of 'uuid' attribute}}
 struct __declspec(uuid("0000000-0000-0000-1234-0000500000047")) uuid_attr_bad3 { };// expected-error {{uuid attribute contains a malformed GUID}}
 struct __declspec(uuid("0000000-0000-0000-Z234-000000000047")) uuid_attr_bad4 { };// expected-error {{uuid attribute contains a malformed GUID}}
 struct __declspec(uuid("000000000000-0000-1234-000000000047")) uuid_attr_bad5 { };// expected-error {{uuid attribute contains a malformed GUID}}
diff --git a/clang/test/Parser/c2x-attributes.c b/clang/test/Parser/c2x-attributes.c
index 3e9469f8fa8b..be039e40f98e 100644
--- a/clang/test/Parser/c2x-attributes.c
+++ b/clang/test/Parser/c2x-attributes.c
@@ -16,7 +16,7 @@ enum { [[]] Six }; // expected-error {{expected identifier}}

 // FIXME: this diagnostic can be improved.
enum E3 [[]] { Seven }; // expected-error {{expected identifier or '('}} -[[deprecated([""])]] int WrongArgs; // expected-error {{expected expression}} +[[deprecated([""])]] int WrongArgs; // expected-error {{expected string literal as argument of 'deprecated' attribute}} [[,,,,,]] int Commas1; // ok [[,, maybe_unused]] int Commas2; // ok [[maybe_unused,,,]] int Commas3; // ok @@ -24,6 +24,13 @@ enum E3 [[]] { Seven }; // expected-error {{expected identifier or '('}} [[foo bar]] int NoComma; // expected-error {{expected ','}} \ // expected-warning {{unknown attribute 'foo' ignored}} + +[[deprecated(L"abc")]] void unevaluated_string(void); +// expected-warning@-1 {{encoding prefix 'L' on an unevaluated string literal has no effect}} + +[[nodiscard("\123")]] int unevaluated_string2(void); +// expected-error@-1 {{invalid escape sequence '\123' in an unevaluated string literal}} + struct [[]] S1 { int i [[]]; int [[]] j; diff --git a/clang/test/Parser/cxx-attributes.cpp b/clang/test/Parser/cxx-attributes.cpp index b46326c4d474..51d1f9c82281 100644 --- a/clang/test/Parser/cxx-attributes.cpp +++ b/clang/test/Parser/cxx-attributes.cpp @@ -41,7 +41,7 @@ void fn() { pi = &i[0]; } -[[deprecated([""])]] int WrongArgs; // expected-error {{expected variable name or 'this' in lambda capture list}} +[[deprecated([""])]] int WrongArgs; // expected-error {{expected string literal as argument of 'deprecated' attribute}} [[,,,,,]] int Commas1; // ok [[,, maybe_unused]] int Commas2; // ok [[maybe_unused,,,]] int Commas3; // ok diff --git a/clang/test/Parser/cxx0x-attributes.cpp b/clang/test/Parser/cxx0x-attributes.cpp index 3cb4e180e5f5..10c5bbcac102 100644 --- a/clang/test/Parser/cxx0x-attributes.cpp +++ b/clang/test/Parser/cxx0x-attributes.cpp @@ -94,7 +94,7 @@ class [[]] [[]] final_class_another [[]] [[]] alignas(16) [[]]{}; // expected-error {{an attribute list cannot appear here}} // The diagnostics here don't matter much, this just shouldn't crash: -class C final [[deprecated(l]] {}); // expected-error {{use of undeclared identifier}} expected-error {{expected ']'}} expected-error {{an attribute list cannot appear here}} expected-error {{expected unqualified-id}} +class C final [[deprecated(l]] {}); //expected-error {{expected string literal as argument of 'deprecated' attribute}} expected-error {{an attribute list cannot appear here}} expected-error {{expected unqualified-id}} class D final alignas ([l) {}]{}); // expected-error {{expected ',' or ']' in lambda capture list}} expected-error {{an attribute list cannot appear here}} [[]] struct with_init_declarators {} init_declarator; @@ -266,7 +266,7 @@ template void variadic() { template void variadic_nttp() { void bar [[noreturn...]] (); // expected-error {{attribute 'noreturn' cannot be used as an attribute pack}} - void baz [[clang::no_sanitize(Is...)]] (); // expected-error {{attribute 'no_sanitize' does not support argument pack expansion}} + void baz [[clang::no_sanitize(Is...)]] (); // expected-error {{expected string literal as argument of 'no_sanitize' attribute}} void bor [[clang::annotate("A", "V" ...)]] (); // expected-error {{pack expansion does not contain any unexpanded parameter packs}} void bir [[clang::annotate("B", {1, 2, 3, 4})]] (); // expected-error {{'annotate' attribute requires parameter 1 to be a constant expression}} expected-note {{subexpression not valid in a constant expression}} void boo [[unknown::foo(Is...)]] (); // expected-warning {{unknown attribute 'foo' ignored}} @@ -445,3 +445,9 @@ class Ordering { ) { } }; + 
+namespace P2361 { +[[deprecated(L"abc")]] void a(); // expected-warning{{encoding prefix 'L' on an unevaluated string literal has no effect and is incompatible with c++2c}} \ + // expected-warning {{use of the 'deprecated' attribute is a C++14 extension}} +[[nodiscard("\123")]] int b(); // expected-error{{invalid escape sequence '\123' in an unevaluated string literal}} +} diff --git a/clang/test/Sema/MicrosoftExtensions.c b/clang/test/Sema/MicrosoftExtensions.c index 50077d903148..cf7463d2e76e 100644 --- a/clang/test/Sema/MicrosoftExtensions.c +++ b/clang/test/Sema/MicrosoftExtensions.c @@ -123,7 +123,7 @@ struct __declspec(deprecated) DS1 { int i; float f; }; // expected-note {{'DS1' #define MY_TEXT "This is also deprecated" __declspec(deprecated(MY_TEXT)) void Dfunc1( void ) {} // expected-note {{'Dfunc1' has been explicitly marked deprecated here}} -struct __declspec(deprecated(123)) DS2 {}; // expected-error {{'deprecated' attribute requires a string}} +struct __declspec(deprecated(123)) DS2 {}; // expected-error {{expected string literal as argument of 'deprecated' attribute}} void test( void ) { e1 = one; // expected-warning {{'e1' is deprecated: This is deprecated}} diff --git a/clang/test/Sema/annotate-type.c b/clang/test/Sema/annotate-type.c index 9fd95f953c5d..901cef7ffa8b 100644 --- a/clang/test/Sema/annotate-type.c +++ b/clang/test/Sema/annotate-type.c @@ -20,10 +20,10 @@ void foo(float *[[clang::annotate_type("foo")]] a) { [[clang::annotate_type("bar")]] int *z1; // expected-error {{'annotate_type' attribute cannot be applied to a declaration}} int *z2 [[clang::annotate_type("bar")]]; // expected-error {{'annotate_type' attribute cannot be applied to a declaration}} [[clang::annotate_type("bar")]]; // expected-error {{'annotate_type' attribute cannot be applied to a statement}} - int *[[clang::annotate_type(1)]] z3; // expected-error {{'annotate_type' attribute requires a string}} + int *[[clang::annotate_type(1)]] z3; // expected-error {{expected string literal as argument of 'annotate_type' attribute}} int *[[clang::annotate_type()]] z4; // expected-error {{'annotate_type' attribute takes at least 1 argument}} int *[[clang::annotate_type]] z5; // expected-error {{'annotate_type' attribute takes at least 1 argument}} - int *[[clang::annotate_type(some_function())]] z6; // expected-error {{'annotate_type' attribute requires a string}} + int *[[clang::annotate_type(some_function())]] z6; // expected-error {{expected string literal as argument of 'annotate_type' attribute}} int *[[clang::annotate_type("bar", some_function())]] z7; // expected-error {{'annotate_type' attribute requires parameter 1 to be a constant expression}} expected-note{{subexpression not valid in a constant expression}} int *[[clang::annotate_type("bar", z7)]] z8; // expected-error {{'annotate_type' attribute requires parameter 1 to be a constant expression}} expected-note{{subexpression not valid in a constant expression}} int *[[clang::annotate_type("bar", int)]] z9; // expected-error {{expected expression}} diff --git a/clang/test/Sema/annotate.c b/clang/test/Sema/annotate.c index 2e7a37936fed..b4551a102e61 100644 --- a/clang/test/Sema/annotate.c +++ b/clang/test/Sema/annotate.c @@ -3,8 +3,8 @@ void __attribute__((annotate("foo"))) foo(float *a) { __attribute__((annotate("bar"))) int x; [[clang::annotate("bar")]] int x2; - __attribute__((annotate(1))) int y; // expected-error {{'annotate' attribute requires a string}} - [[clang::annotate(1)]] int y2; // expected-error {{'annotate' attribute requires a 
string}} + __attribute__((annotate(1))) int y; // expected-error {{expected string literal as argument of 'annotate' attribute}} + [[clang::annotate(1)]] int y2; // expected-error {{expected string literal as argument of 'annotate' attribute}} __attribute__((annotate("bar", 1))) int z; [[clang::annotate("bar", 1)]] int z2; diff --git a/clang/test/Sema/attr-assume.c b/clang/test/Sema/attr-assume.c index cb07940d02b5..98deffa3a746 100644 --- a/clang/test/Sema/attr-assume.c +++ b/clang/test/Sema/attr-assume.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -triple i386-apple-darwin9 -fsyntax-only -verify %s -void f1(void) __attribute__((assume(3))); // expected-error {{'assume' attribute requires a string}} -void f2(void) __attribute__((assume(int))); // expected-error {{expected expression}} -void f3(void) __attribute__((assume(for))); // expected-error {{expected expression}} +void f1(void) __attribute__((assume(3))); // expected-error {{expected string literal as argument of 'assume' attribute}} +void f2(void) __attribute__((assume(int))); // expected-error {{expected string literal as argument of 'assume' attribute}} +void f3(void) __attribute__((assume(for))); // expected-error {{expected string literal as argument of 'assume' attribute}} void f4(void) __attribute__((assume("QQQQ"))); // expected-warning {{unknown assumption string 'QQQQ'; attribute is potentially ignored}} void f5(void) __attribute__((assume("omp_no_openmp"))); void f6(void) __attribute__((assume("omp_noopenmp"))); // expected-warning {{unknown assumption string 'omp_noopenmp' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}} @@ -10,5 +10,5 @@ void f7(void) __attribute__((assume("omp_no_openmp_routine"))); // expected-warn void f8(void) __attribute__((assume("omp_no_openmp1"))); // expected-warning {{unknown assumption string 'omp_no_openmp1' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}} void f9(void) __attribute__((assume("omp_no_openmp", "omp_no_openmp"))); // expected-error {{'assume' attribute takes one argument}} -int g1 __attribute__((assume(0))); // expected-warning {{'assume' attribute only applies to functions and Objective-C methods}} +int g1 __attribute__((assume(0))); // expected-error {{expected string literal as argument of 'assume' attribute}} int g2 __attribute__((assume("omp_no_openmp"))); // expected-warning {{'assume' attribute only applies to functions and Objective-C methods}} diff --git a/clang/test/Sema/attr-btf_tag.c b/clang/test/Sema/attr-btf_tag.c index d33ccdf96282..cbb21a5a88bb 100644 --- a/clang/test/Sema/attr-btf_tag.c +++ b/clang/test/Sema/attr-btf_tag.c @@ -21,7 +21,7 @@ struct __tag2 __tag3 t2 { int g1 __tag1; int g2 __tag_no_arg; // expected-error {{'btf_decl_tag' attribute takes one argument}} int g3 __tag_2_arg; // expected-error {{'btf_decl_tag' attribute takes one argument}} -int i1 __invalid; // expected-error {{'btf_decl_tag' attribute requires a string}} +int i1 __invalid; // expected-error {{expected string literal as argument of 'btf_decl_tag' attribute}} enum e1 { E1 diff --git a/clang/test/Sema/attr-btf_type_tag.c b/clang/test/Sema/attr-btf_type_tag.c index a3ef404f3823..aa3230a3e9b7 100644 --- a/clang/test/Sema/attr-btf_type_tag.c +++ b/clang/test/Sema/attr-btf_type_tag.c @@ -8,7 +8,7 @@ #define __tag6 __attribute__((btf_type_tag("tag6"))) int __attribute__((btf_type_tag("tag1", "tag2"))) *invalid1; // expected-error {{'btf_type_tag' attribute takes one argument}} -int __attribute__((btf_type_tag(2))) *invalid2; // 
expected-error {{'btf_type_tag' attribute requires a string}} +int __attribute__((btf_type_tag(2))) *invalid2; // expected-error {{expected string literal as argument of 'btf_type_tag' attribute}} int * __tag1 __tag2 * __tag3 __tag4 * __tag5 __tag6 *g; diff --git a/clang/test/Sema/attr-capabilities.c b/clang/test/Sema/attr-capabilities.c index b28c437935a1..5138803bd5eb 100644 --- a/clang/test/Sema/attr-capabilities.c +++ b/clang/test/Sema/attr-capabilities.c @@ -17,8 +17,8 @@ int Test3 __attribute__((acquire_capability("test3"))); // expected-warning {{' int Test4 __attribute__((try_acquire_capability("test4"))); // expected-error {{'try_acquire_capability' attribute only applies to functions}} int Test5 __attribute__((release_capability("test5"))); // expected-warning {{'release_capability' attribute only applies to functions}} -struct __attribute__((capability(12))) Test3 {}; // expected-error {{'capability' attribute requires a string}} -struct __attribute__((shared_capability(Test2))) Test4 {}; // expected-error {{'shared_capability' attribute requires a string}} +struct __attribute__((capability(12))) Test3 {}; // expected-error {{expected string literal as argument of 'capability' attribute}} +struct __attribute__((shared_capability(Test2))) Test4 {}; // expected-error {{expected string literal as argument of 'shared_capability' attribute}} struct __attribute__((capability)) Test5 {}; // expected-error {{'capability' attribute takes one argument}} struct __attribute__((shared_capability("test1", 12))) Test6 {}; // expected-error {{'shared_capability' attribute takes one argument}} diff --git a/clang/test/Sema/attr-enforce-tcb-errors.cpp b/clang/test/Sema/attr-enforce-tcb-errors.cpp index 1ce147ab32df..b5effe2fe518 100644 --- a/clang/test/Sema/attr-enforce-tcb-errors.cpp +++ b/clang/test/Sema/attr-enforce-tcb-errors.cpp @@ -6,14 +6,14 @@ void no_arguments() __attribute__((enforce_tcb)); // expected-error{{'enforce_tc void too_many_arguments() __attribute__((enforce_tcb("test", 12))); // expected-error{{'enforce_tcb' attribute takes one argument}} -void wrong_argument_type() __attribute__((enforce_tcb(12))); // expected-error{{'enforce_tcb' attribute requires a string}} +void wrong_argument_type() __attribute__((enforce_tcb(12))); // expected-error{{expected string literal as argument of 'enforce_tcb' attribute}} [[clang::enforce_tcb_leaf("oops")]] int wrong_subject_type_leaf; // expected-warning{{'enforce_tcb_leaf' attribute only applies to functions}} void no_arguments_leaf() __attribute__((enforce_tcb_leaf)); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} void too_many_arguments_leaf() __attribute__((enforce_tcb_leaf("test", 12))); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} -void wrong_argument_type_leaf() __attribute__((enforce_tcb_leaf(12))); // expected-error{{'enforce_tcb_leaf' attribute requires a string}} +void wrong_argument_type_leaf() __attribute__((enforce_tcb_leaf(12))); // expected-error{{expected string literal as argument of 'enforce_tcb_leaf' attribute}} void foo(); diff --git a/clang/test/Sema/attr-enforce-tcb-errors.m b/clang/test/Sema/attr-enforce-tcb-errors.m index f82d38c919d1..c8d0716553cc 100644 --- a/clang/test/Sema/attr-enforce-tcb-errors.m +++ b/clang/test/Sema/attr-enforce-tcb-errors.m @@ -20,13 +20,13 @@ __attribute__((objc_root_class)) - (void)tooManyArguments __attribute__((enforce_tcb("test", 12))); // expected-error{{'enforce_tcb' attribute takes one argument}} -- (void)wrongArgumentType 
__attribute__((enforce_tcb(12))); // expected-error{{'enforce_tcb' attribute requires a string}} +- (void)wrongArgumentType __attribute__((enforce_tcb(12))); // expected-error{{expected string literal as argument of 'enforce_tcb' attribute}} - (void)noArgumentsLeaf __attribute__((enforce_tcb_leaf)); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} - (void)tooManyArgumentsLeaf __attribute__((enforce_tcb_leaf("test", 12))); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} -- (void)wrongArgumentTypeLeaf __attribute__((enforce_tcb_leaf(12))); // expected-error{{'enforce_tcb_leaf' attribute requires a string}} +- (void)wrongArgumentTypeLeaf __attribute__((enforce_tcb_leaf(12))); // expected-error{{expected string literal as argument of 'enforce_tcb_leaf' attribute}} @end @implementation AClass diff --git a/clang/test/Sema/attr-error.c b/clang/test/Sema/attr-error.c index 581bfc43cbc0..9f500a87be14 100644 --- a/clang/test/Sema/attr-error.c +++ b/clang/test/Sema/attr-error.c @@ -15,7 +15,7 @@ int bad2(void) { __attribute__((error("bad2"))); // expected-error {{'error' attribute cannot be applied to a statement}} } -__attribute__((error(3))) // expected-error {{'error' attribute requires a string}} +__attribute__((error(3))) // expected-error {{expected string literal as argument of 'error' attribute}} int bad3(void); diff --git a/clang/test/Sema/attr-handles.cpp b/clang/test/Sema/attr-handles.cpp index 135467b6c0a8..ff1c1f68dfec 100644 --- a/clang/test/Sema/attr-handles.cpp +++ b/clang/test/Sema/attr-handles.cpp @@ -6,7 +6,7 @@ void (*fp)(int handle [[clang::use_handle("Fuchsia")]]); auto lambda = [](int handle [[clang::use_handle("Fuchsia")]]){}; void g(int a __attribute__((acquire_handle("Fuchsia")))); // expected-error {{attribute only applies to output parameters}} void h(int *a __attribute__((acquire_handle))); // expected-error {{'acquire_handle' attribute takes one argument}} -void h(int *a __attribute__((acquire_handle(1)))); // expected-error {{attribute requires a string}} +void h(int *a __attribute__((acquire_handle(1)))); // expected-error {{expected string literal as argument of 'acquire_handle' attribute}} void h(int *a __attribute__((acquire_handle("RandomString", "AndAnother")))); // expected-error {{'acquire_handle' attribute takes one argument}} __attribute__((release_handle("Fuchsia"))) int i(); // expected-warning {{'release_handle' attribute only applies to parameters}} __attribute__((use_handle("Fuchsia"))) int j(); // expected-warning {{'use_handle' attribute only applies to parameters}} diff --git a/clang/test/Sema/attr-section.c b/clang/test/Sema/attr-section.c index 3ea922c91947..1f058c24f980 100644 --- a/clang/test/Sema/attr-section.c +++ b/clang/test/Sema/attr-section.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-apple-darwin9 %s int x __attribute__((section( - 42))); // expected-error {{'section' attribute requires a string}} + 42))); // expected-error {{expected string literal as argument of 'section' attribute}} // rdar://4341926 diff --git a/clang/test/Sema/attr-tls_model.c b/clang/test/Sema/attr-tls_model.c index 9106d39b55a3..ec99c7aacaa3 100644 --- a/clang/test/Sema/attr-tls_model.c +++ b/clang/test/Sema/attr-tls_model.c @@ -10,5 +10,5 @@ int x __attribute((tls_model("global-dynamic"))); // expected-error {{'tls_model static __thread int y __attribute((tls_model("global-dynamic"))); // no-warning static __thread int y __attribute((tls_model("local", "dynamic"))); // expected-error {{'tls_model' 
attribute takes one argument}} -static __thread int y __attribute((tls_model(123))); // expected-error {{'tls_model' attribute requires a string}} +static __thread int y __attribute((tls_model(123))); // expected-error {{expected string literal as argument of 'tls_model' attribute}} static __thread int y __attribute((tls_model("foobar"))); // expected-error {{tls_model must be "global-dynamic", "local-dynamic", "initial-exec" or "local-exec"}} diff --git a/clang/test/Sema/attr-unavailable-message.c b/clang/test/Sema/attr-unavailable-message.c index b4ae70e0a39c..0caa943ad8a4 100644 --- a/clang/test/Sema/attr-unavailable-message.c +++ b/clang/test/Sema/attr-unavailable-message.c @@ -8,7 +8,7 @@ double dfoo(double) __attribute__((__unavailable__("NO LONGER"))); // expected- void bar(void) __attribute__((__unavailable__)); // expected-note {{explicitly marked unavailable}} -int quux(void) __attribute__((__unavailable__(12))); // expected-error {{'__unavailable__' attribute requires a string}} +int quux(void) __attribute__((__unavailable__(12))); // expected-error {{expected string literal as argument of '__unavailable__' attribute}} #define ACCEPTABLE "Use something else" int quux2(void) __attribute__((__unavailable__(ACCEPTABLE))); diff --git a/clang/test/Sema/attr-warning.c b/clang/test/Sema/attr-warning.c index 0973f3c8eb4b..7510e88d291b 100644 --- a/clang/test/Sema/attr-warning.c +++ b/clang/test/Sema/attr-warning.c @@ -15,7 +15,7 @@ int bad2(void) { __attribute__((warning("bad2"))); // expected-error {{'warning' attribute cannot be applied to a statement}} } -__attribute__((warning(3))) // expected-error {{'warning' attribute requires a string}} +__attribute__((warning(3))) // expected-error {{expected string literal as argument of 'warning' attribute}} int bad3(void); diff --git a/clang/test/Sema/diagnose_if.c b/clang/test/Sema/diagnose_if.c index 6297545781f9..4df39916c031 100644 --- a/clang/test/Sema/diagnose_if.c +++ b/clang/test/Sema/diagnose_if.c @@ -6,7 +6,7 @@ void failure1(void) _diagnose_if(); // expected-error{{exactly 3 arguments}} void failure2(void) _diagnose_if(0); // expected-error{{exactly 3 arguments}} void failure3(void) _diagnose_if(0, ""); // expected-error{{exactly 3 arguments}} void failure4(void) _diagnose_if(0, "", "error", 1); // expected-error{{exactly 3 arguments}} -void failure5(void) _diagnose_if(0, 0, "error"); // expected-error{{requires a string}} +void failure5(void) _diagnose_if(0, 0, "error"); // expected-error{{expected string literal as argument of 'diagnose_if' attribute}} void failure6(void) _diagnose_if(0, "", "invalid"); // expected-error{{invalid diagnostic type for 'diagnose_if'; use "error" or "warning" instead}} void failure7(void) _diagnose_if(0, "", "ERROR"); // expected-error{{invalid diagnostic type}} void failure8(int a) _diagnose_if(a, "", ""); // expected-error{{invalid diagnostic type}} diff --git a/clang/test/Sema/enable_if.c b/clang/test/Sema/enable_if.c index 22eb84fa7275..9d46c71274d6 100644 --- a/clang/test/Sema/enable_if.c +++ b/clang/test/Sema/enable_if.c @@ -105,7 +105,7 @@ __attribute__((enable_if(n == 0, "chosen when 'n' is zero"))) void f1(int n); // int n __attribute__((enable_if(1, "always chosen"))); // expected-warning{{'enable_if' attribute only applies to functions}} -void f(int n) __attribute__((enable_if("chosen when 'n' is zero", n == 0))); // expected-error{{'enable_if' attribute requires a string}} +void f(int n) __attribute__((enable_if("chosen when 'n' is zero", n == 0))); // expected-error{{expected string 
literal as argument of 'enable_if' attribute}} void f(int n) __attribute__((enable_if())); // expected-error{{'enable_if' attribute requires exactly 2 arguments}} diff --git a/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp b/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp index 54d0f9e74f34..8c9d9b29c553 100644 --- a/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp +++ b/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp @@ -5,13 +5,13 @@ #endif int a1 [[deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} -int a2 [[deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} +int a2 [[deprecated("warning", 1)]]; // expected-error{{expected string literal as argument of 'deprecated' attribute}} int b1 [[gnu::deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} -int b2 [[gnu::deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} +int b2 [[gnu::deprecated("warning", 1)]]; // expected-error{{expected string literal as argument of 'deprecated' attribute}} __declspec(deprecated("warning", "fixit")) int c1; // expected-error{{'deprecated' attribute takes no more than 1 argument}} -__declspec(deprecated("warning", 1)) int c2; // expected-error{{'deprecated' attribute takes no more than 1 argument}} +__declspec(deprecated("warning", 1)) int c2; // expected-error{{expected string literal as argument of 'deprecated' attribute}} int d1 __attribute__((deprecated("warning", "fixit"))); -int d2 __attribute__((deprecated("warning", 1))); // expected-error{{'deprecated' attribute requires a string}} +int d2 __attribute__((deprecated("warning", 1))); // expected-error{{expected string literal as argument of 'deprecated' attribute}} diff --git a/clang/test/SemaCXX/attr-no-sanitize.cpp b/clang/test/SemaCXX/attr-no-sanitize.cpp index 9e13fd3c0270..a464947fe5a3 100644 --- a/clang/test/SemaCXX/attr-no-sanitize.cpp +++ b/clang/test/SemaCXX/attr-no-sanitize.cpp @@ -4,7 +4,7 @@ int f1() __attribute__((no_sanitize)); // expected-error{{'no_sanitize' attribute takes at least 1 argument}} -int f2() __attribute__((no_sanitize(1))); // expected-error{{'no_sanitize' attribute requires a string}} +int f2() __attribute__((no_sanitize(1))); // expected-error{{expected string literal as argument of 'no_sanitize' attribute}} __attribute__((no_sanitize("all"))) int global; // expected-warning{{'no_sanitize' attribute argument 'all' not supported on a global variable}} __attribute__((no_sanitize("unknown"))) int global2; // expected-warning{{unknown sanitizer 'unknown' ignored}} diff --git a/clang/test/SemaCXX/attr-section.cpp b/clang/test/SemaCXX/attr-section.cpp index 12c0da283997..62b597b4ff36 100644 --- a/clang/test/SemaCXX/attr-section.cpp +++ b/clang/test/SemaCXX/attr-section.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-linux-gnu %s int x __attribute__((section( - 42))); // expected-error {{'section' attribute requires a string}} + 42))); // expected-error {{expected string literal as argument of 'section' attribute}} // PR6007 diff --git a/clang/test/SemaCXX/attr-weakref.cpp b/clang/test/SemaCXX/attr-weakref.cpp index 46ca5ab20682..4e1834096ebe 100644 --- a/clang/test/SemaCXX/attr-weakref.cpp +++ b/clang/test/SemaCXX/attr-weakref.cpp @@ -33,6 +33,6 @@ int a9 __attribute__((weakref)); // expected-error {{weakref declaration of 'a9 static int a10(); int a10() 
__attribute__((weakref ("foo"))); -static int v __attribute__((weakref(a1), alias("foo"))); // expected-error {{'weakref' attribute requires a string}} +static int v __attribute__((weakref(a1), alias("foo"))); // expected-error {{expected string literal as argument of 'weakref' attribute}} __attribute__((weakref ("foo"))) auto a11 = 1; // expected-error {{weakref declaration must have internal linkage}} diff --git a/clang/test/SemaCXX/suppress.cpp b/clang/test/SemaCXX/suppress.cpp index d88ae0bbca00..29544b3c573c 100644 --- a/clang/test/SemaCXX/suppress.cpp +++ b/clang/test/SemaCXX/suppress.cpp @@ -16,7 +16,7 @@ void f_() { [[gsl::suppress]] int x; // expected-error {{'suppress' attribute takes at least 1 argument}} [[gsl::suppress()]] int y; // expected-error {{'suppress' attribute takes at least 1 argument}} int [[gsl::suppress("r")]] z; // expected-error {{'suppress' attribute cannot be applied to types}} - [[gsl::suppress(f_)]] float f; // expected-error {{'suppress' attribute requires a string}} + [[gsl::suppress(f_)]] float f; // expected-error {{expected string literal as argument of 'suppress' attribute}} } union [[gsl::suppress("type.1")]] U { diff --git a/clang/test/SemaObjC/attr-swift_bridge.m b/clang/test/SemaObjC/attr-swift_bridge.m index 8f53ed815458..0464e3f326e0 100644 --- a/clang/test/SemaObjC/attr-swift_bridge.m +++ b/clang/test/SemaObjC/attr-swift_bridge.m @@ -5,7 +5,7 @@ __attribute__((__swift_bridge__)) @interface I @end -// expected-error@+1 {{'__swift_bridge__' attribute requires a string}} +// expected-error@+1 {{expected string literal as argument of '__swift_bridge__' attribute}} __attribute__((__swift_bridge__(1))) @interface J @end diff --git a/clang/test/SemaObjC/objc-asm-attribute-neg-test.m b/clang/test/SemaObjC/objc-asm-attribute-neg-test.m index 9941189357ba..ac3871969fe2 100644 --- a/clang/test/SemaObjC/objc-asm-attribute-neg-test.m +++ b/clang/test/SemaObjC/objc-asm-attribute-neg-test.m @@ -5,7 +5,7 @@ __attribute__((objc_runtime_name)) // expected-error {{'objc_runtime_name' attri @interface BInterface @end -__attribute__((objc_runtime_name(123))) // expected-error {{'objc_runtime_name' attribute requires a string}} +__attribute__((objc_runtime_name(123))) // expected-error {{expected string literal as argument of 'objc_runtime_name' attribute}} @protocol BProtocol1 @end @@ -14,7 +14,7 @@ __attribute__((objc_runtime_name("MySecretNamespace.Protocol"))) @end __attribute__((objc_runtime_name("MySecretNamespace.Message"))) -@interface Message { +@interface Message { __attribute__((objc_runtime_name("MySecretNamespace.Message"))) // expected-error {{'objc_runtime_name' attribute only applies to Objective-C interfaces and Objective-C protocols}} id MyIVAR; } diff --git a/clang/test/SemaObjC/validate-attr-swift_attr.m b/clang/test/SemaObjC/validate-attr-swift_attr.m index 4ff434d17972..2c73b0a89272 100644 --- a/clang/test/SemaObjC/validate-attr-swift_attr.m +++ b/clang/test/SemaObjC/validate-attr-swift_attr.m @@ -5,7 +5,7 @@ __attribute__((swift_attr)) @interface I @end -// expected-error@+1 {{'swift_attr' attribute requires a string}} +// expected-error@+1 {{expected string literal as argument of 'swift_attr' attribute}} __attribute__((swift_attr(1))) @interface J @end diff --git a/clang/test/SemaTemplate/attributes.cpp b/clang/test/SemaTemplate/attributes.cpp index a7081e83470a..9fd448a5e935 100644 --- a/clang/test/SemaTemplate/attributes.cpp +++ b/clang/test/SemaTemplate/attributes.cpp @@ -25,7 +25,7 @@ namespace attribute_aligned { {
__attribute__((aligned(Align))) char storage[Size]; }; - + template class C { public: @@ -95,11 +95,11 @@ void UseAnnotations() { HasAnnotations(); } template [[clang::annotate("ANNOTATE_BAZ", Is...)]] void HasPackAnnotations(); void UsePackAnnotations() { HasPackAnnotations<1, 2, 3>(); } -template [[clang::annotate(Is...)]] void HasOnlyPackAnnotation() {} // expected-error {{'annotate' attribute takes at least 1 argument}} expected-error {{'annotate' attribute requires a string}} +template [[clang::annotate(Is...)]] void HasOnlyPackAnnotation() {} // expected-error {{expected string literal as argument of 'annotate' attribute}} void UseOnlyPackAnnotations() { - HasOnlyPackAnnotation<>(); // expected-note {{in instantiation of function template specialization 'attribute_annotate::HasOnlyPackAnnotation<>' requested here}} - HasOnlyPackAnnotation<1>(); // expected-note {{in instantiation of function template specialization 'attribute_annotate::HasOnlyPackAnnotation<1>' requested here}} + HasOnlyPackAnnotation<>(); + HasOnlyPackAnnotation<1>(); } // CHECK: ClassTemplateDecl {{.*}} AnnotatedPackTemplateStruct @@ -276,40 +276,21 @@ void UseOnlyPackAnnotations() { // CHECK-NEXT: value: Int 6 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct AnnotatedPackTemplateStruct -// CHECK-NEXT: ClassTemplatePartialSpecializationDecl {{.*}} struct AnnotatedPackTemplateStruct definition -// CHECK-NEXT: DefinitionData -// CHECK-NEXT: DefaultConstructor -// CHECK-NEXT: CopyConstructor -// CHECK-NEXT: MoveConstructor -// CHECK-NEXT: CopyAssignment -// CHECK-NEXT: MoveAssignment -// CHECK-NEXT: Destructor -// CHECK-NEXT: TemplateArgument{{.*}} type 'char' -// CHECK-NEXT: BuiltinType {{.*}} 'char' -// CHECK-NEXT: TemplateArgument{{.*}} pack -// CHECK-NEXT: TemplateArgument{{.*}} expr -// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' -// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' -// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... 
Is -// CHECK-NEXT: AnnotateAttr {{.*}} "" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' -// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' -// CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct AnnotatedPackTemplateStruct template struct [[clang::annotate("ANNOTATE_FOZ", Is...)]] AnnotatedPackTemplateStruct{}; template struct [[clang::annotate("ANNOTATE_BOO", Is...)]] AnnotatedPackTemplateStruct{}; template struct [[clang::annotate("ANNOTATE_FOZ", 4, 5, 6)]] AnnotatedPackTemplateStruct{}; -template struct [[clang::annotate(Is...)]] AnnotatedPackTemplateStruct{}; // expected-error {{'annotate' attribute requires a string}} expected-error {{'annotate' attribute takes at least 1 argument}} +template struct [[clang::annotate(Is...)]] AnnotatedPackTemplateStruct{}; // expected-error {{expected string literal as argument of 'annotate' attribute}} void UseAnnotatedPackTemplateStructSpecializations() { AnnotatedPackTemplateStruct Instance1{}; AnnotatedPackTemplateStruct Instance2{}; AnnotatedPackTemplateStruct Instance3{}; - AnnotatedPackTemplateStruct Instance4{}; // expected-note {{in instantiation of template class 'attribute_annotate::AnnotatedPackTemplateStruct' requested here}} - AnnotatedPackTemplateStruct Instance5{}; // expected-note {{in instantiation of template class 'attribute_annotate::AnnotatedPackTemplateStruct' requested here}} + AnnotatedPackTemplateStruct Instance4{}; + AnnotatedPackTemplateStruct Instance5{}; } // CHECK: ClassTemplateDecl {{.*}} InvalidAnnotatedPackTemplateStruct // CHECK-NEXT: TemplateTypeParmDecl {{.*}} typename depth 0 index 0 T -// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 1 ... Is +// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} 'int' depth 0 index 1 ... Is // CHECK-NEXT: CXXRecordDecl {{.*}} struct InvalidAnnotatedPackTemplateStruct definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -318,9 +299,6 @@ void UseAnnotatedPackTemplateStructSpecializations() { // CHECK-NEXT: CopyAssignment // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor -// CHECK-NEXT: AnnotateAttr {{.*}} "" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' -// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct // CHECK-NEXT: ClassTemplateSpecialization {{.*}} 'InvalidAnnotatedPackTemplateStruct' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct InvalidAnnotatedPackTemplateStruct definition @@ -446,7 +424,7 @@ void UseAnnotatedPackTemplateStructSpecializations() { // CHECK-NEXT: TemplateArgument{{.*}} integral 6 // CHECK-NEXT: TemplateArgument{{.*}} integral 7 // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct -template struct [[clang::annotate(Is...)]] InvalidAnnotatedPackTemplateStruct{}; // expected-error {{'annotate' attribute requires a string}} expected-error {{'annotate' attribute takes at least 1 argument}} +template struct InvalidAnnotatedPackTemplateStruct{}; template struct [[clang::annotate("ANNOTATE_BIR", Is...)]] InvalidAnnotatedPackTemplateStruct{}; template struct InvalidAnnotatedPackTemplateStruct {}; template <> struct InvalidAnnotatedPackTemplateStruct {}; @@ -454,8 +432,8 @@ void UseInvalidAnnotatedPackTemplateStruct() { InvalidAnnotatedPackTemplateStruct Instance1{}; InvalidAnnotatedPackTemplateStruct Instance2{}; InvalidAnnotatedPackTemplateStruct Instance3{}; - InvalidAnnotatedPackTemplateStruct Instance4{}; // expected-note {{in 
instantiation of template class 'attribute_annotate::InvalidAnnotatedPackTemplateStruct' requested here}} - InvalidAnnotatedPackTemplateStruct Instance5{}; // expected-note {{in instantiation of template class 'attribute_annotate::InvalidAnnotatedPackTemplateStruct' requested here}} + InvalidAnnotatedPackTemplateStruct Instance4{}; + InvalidAnnotatedPackTemplateStruct Instance5{}; } // CHECK: FunctionTemplateDecl {{.*}} RedeclaredAnnotatedFunc diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 79db17501b64..529db64cf5cc 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -2297,6 +2297,22 @@ static bool isVariadicExprArgument(const Record *Arg) { .Default(false); } +static bool isStringLiteralArgument(const Record *Arg) { + return !Arg->getSuperClasses().empty() && + llvm::StringSwitch<bool>( + Arg->getSuperClasses().back().first->getName()) + .Case("StringArgument", true) + .Default(false); +} + +static bool isVariadicStringLiteralArgument(const Record *Arg) { + return !Arg->getSuperClasses().empty() && + llvm::StringSwitch<bool>( + Arg->getSuperClasses().back().first->getName()) + .Case("VariadicStringArgument", true) + .Default(false); +} + static void emitClangAttrVariadicIdentifierArgList(RecordKeeper &Records, raw_ostream &OS) { OS << "#if defined(CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST)\n"; @@ -2317,6 +2333,34 @@ static void emitClangAttrVariadicIdentifierArgList(RecordKeeper &Records, OS << "#endif // CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST\n\n"; } +// Emits the list of arguments that should be parsed as unevaluated string +// literals for each attribute. +static void emitClangAttrUnevaluatedStringLiteralList(RecordKeeper &Records, + raw_ostream &OS) { + OS << "#if defined(CLANG_ATTR_STRING_LITERAL_ARG_LIST)\n"; + std::vector<Record *> Attrs = Records.getAllDerivedDefinitions("Attr"); + for (const auto *Attr : Attrs) { + std::vector<Record *> Args = Attr->getValueAsListOfDefs("Args"); + uint32_t Bits = 0; + assert(Args.size() <= 32 && "unsupported number of arguments in attribute"); + for (uint32_t N = 0; N < Args.size(); ++N) { + Bits |= (isStringLiteralArgument(Args[N]) << N); + // If we have a variadic string argument, set all the remaining bits to 1 + if (isVariadicStringLiteralArgument(Args[N])) { + Bits |= maskTrailingZeros<uint32_t>(N); + break; + } + } + if (!Bits) + continue; + // All these spellings have at least one string literal as argument. + forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) { + OS << ".Case(\"" << S.name() << "\", " << Bits << ")\n"; + }); + } + OS << "#endif // CLANG_ATTR_STRING_LITERAL_ARG_LIST\n\n"; +} + // Emits the first-argument-is-identifier property for attributes. static void emitClangAttrIdentifierArgList(RecordKeeper &Records, raw_ostream &OS) { OS << "#if defined(CLANG_ATTR_IDENTIFIER_ARG_LIST)\n"; @@ -4616,6 +4660,7 @@ void EmitClangAttrParserStringSwitches(RecordKeeper &Records, emitSourceFileHeader("Parser-related llvm::StringSwitch cases", OS); emitClangAttrArgContextList(Records, OS); emitClangAttrIdentifierArgList(Records, OS); + emitClangAttrUnevaluatedStringLiteralList(Records, OS); emitClangAttrVariadicIdentifierArgList(Records, OS); emitClangAttrThisIsaIdentifierArgList(Records, OS); emitClangAttrAcceptsExprPack(Records, OS); diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 23dc53b5a3cb..08ac5b9a5db2 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -115,12 +115,7 @@ C++23, informally referred to as C++26.

Unevaluated strings P2361R6 - -
- Clang 17 (Partial) - Attributes arguments don't yet parse as unevaluated string literals. -
- + Clang 18 Add @, $, and ` to the basic character set -- Gitee From 9363509b463d60716e44b7162fdf18fa9a6120c5 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 22 Aug 2023 17:51:57 +0100 Subject: [PATCH 15/77] [AArch64] Add extra SME attribute tests for expanded intrinsics. NFC See D136361. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../AArch64/sme-disable-gisel-fisel.ll | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 979ac24e3869..307a56fda9a5 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -357,3 +357,66 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw %res = fadd fp128 %a, %b ret fp128 %res } + +; FIXME: As above this should use Selection DAG to make sure the libcall call is lowered correctly. +define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nounwind { +; CHECK-FISEL-LABEL: frem_call_za: +; CHECK-FISEL: // %bb.0: +; CHECK-FISEL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-FISEL-NEXT: mov x29, sp +; CHECK-FISEL-NEXT: sub sp, sp, #16 +; CHECK-FISEL-NEXT: rdsvl x8, #1 +; CHECK-FISEL-NEXT: mov x9, sp +; CHECK-FISEL-NEXT: mul x8, x8, x8 +; CHECK-FISEL-NEXT: sub x9, x9, x8 +; CHECK-FISEL-NEXT: mov sp, x9 +; CHECK-FISEL-NEXT: stur x9, [x29, #-16] +; CHECK-FISEL-NEXT: sub x9, x29, #16 +; CHECK-FISEL-NEXT: sturh w8, [x29, #-8] +; CHECK-FISEL-NEXT: msr TPIDR2_EL0, x9 +; CHECK-FISEL-NEXT: bl fmod +; CHECK-FISEL-NEXT: smstart za +; CHECK-FISEL-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-FISEL-NEXT: sub x0, x29, #16 +; CHECK-FISEL-NEXT: cbnz x8, .LBB10_2 +; CHECK-FISEL-NEXT: // %bb.1: +; CHECK-FISEL-NEXT: bl __arm_tpidr2_restore +; CHECK-FISEL-NEXT: .LBB10_2: +; CHECK-FISEL-NEXT: msr TPIDR2_EL0, xzr +; CHECK-FISEL-NEXT: mov sp, x29 +; CHECK-FISEL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-FISEL-NEXT: ret +; +; CHECK-GISEL-LABEL: frem_call_za: +; CHECK-GISEL: // %bb.0: +; CHECK-GISEL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-GISEL-NEXT: bl fmod +; CHECK-GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GISEL-NEXT: ret + %res = frem double %a, %b + ret double %res +} + +; FIXME: As above this should use Selection DAG to make sure the libcall is lowered correctly. +define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind { +; CHECK-COMMON-LABEL: frem_call_sm: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl fmodf +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret + %res = frem float %a, %b + ret float %res +} + +; FIXME: As above this should use Selection DAG to make sure the libcall is lowered correctly. +define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind { +; CHECK-COMMON-LABEL: frem_call_sm_compat: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl fmodf +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret + %res = frem float %a, %b + ret float %res +} -- Gitee From aed1a79f8e1142657dea35c8573e6039b8c774f4 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 22 Aug 2023 18:06:27 +0100 Subject: [PATCH 16/77] [AArch64] Disable GlobalISel/FastISel for more SME functions The patch D136361 disabled GlobalISel and FastISel for some SME functions, as the saving and restoring of SM is not yet handled. There were several tests added for fp128 fadd, which will be expanded to a libcall, that only happened to work by accident and did not handle other cases such as f32/f64 frem libcalls. This extends the cases where GlobalISel / FastISel is disabled for functions with SME attributes, under the assumption that it is difficult to reliably tell what will become a libcall, and so we should fall back for all functions until GlobalISel and/or FastISel can handle them. Differential Revision: https://reviews.llvm.org/D158490 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- llvm/lib/Target/AArch64/AArch64FastISel.cpp | 4 +- .../AArch64/GISel/AArch64CallLowering.cpp | 4 +- .../AArch64/sme-disable-gisel-fisel.ll | 187 +++++++++--------- 3 files changed, 100 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 1ae3709e9588..b2c46939e584 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -5187,8 +5187,8 @@ FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) { SMEAttrs CallerAttrs(*FuncInfo.Fn); - if (CallerAttrs.hasZAState() || - (!CallerAttrs.hasStreamingInterface() && CallerAttrs.hasStreamingBody())) + if (CallerAttrs.hasZAState() || CallerAttrs.hasStreamingInterfaceOrBody() || + CallerAttrs.hasStreamingCompatibleInterface()) return nullptr; return new AArch64FastISel(FuncInfo, LibInfo); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index c56e3373d3a7..4cf7e0bd7be4 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -532,8 +532,8 @@ bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const { } SMEAttrs Attrs(F); - if (Attrs.hasNewZAInterface() || - (!Attrs.hasStreamingInterface() && Attrs.hasStreamingBody())) + if (Attrs.hasZAState() || Attrs.hasStreamingInterfaceOrBody() || + Attrs.hasStreamingCompatibleInterface()) return true; return false; diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 307a56fda9a5..0799b2a6edbe 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -68,57 +68,31 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_enabled" { -; CHECK-FISEL-LABEL: streaming_caller_nonstreaming_callee: -; CHECK-FISEL: // %bb.0: // %entry -; CHECK-FISEL-NEXT: sub sp, sp, #96 -; CHECK-FISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -;
CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: smstop sm -; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: bl normal_callee -; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: smstart sm -; CHECK-FISEL-NEXT: adrp x8, .LCPI1_0 -; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: fadd d0, d1, d0 -; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: add sp, sp, #96 -; CHECK-FISEL-NEXT: ret -; -; CHECK-GISEL-LABEL: streaming_caller_nonstreaming_callee: -; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: sub sp, sp, #96 -; CHECK-GISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: smstop sm -; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: bl normal_callee -; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: smstart sm -; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 -; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: fadd d0, d1, d0 -; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: add sp, sp, #96 -; CHECK-GISEL-NEXT: ret +; CHECK-COMMON-LABEL: streaming_caller_nonstreaming_callee: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: bl normal_callee +; CHECK-COMMON-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d0, x8 +; CHECK-COMMON-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: fadd d0, d1, d0 +; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, 
d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret entry: %call = call double @normal_callee(double %x) %add = fadd double %call, 4.200000e+01 @@ -358,64 +332,95 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw ret fp128 %res } -; FIXME: As above this should use Selection DAG to make sure the libcall call is lowered correctly. +; As above this should use Selection DAG to make sure the libcall call is lowered correctly. define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nounwind { -; CHECK-FISEL-LABEL: frem_call_za: -; CHECK-FISEL: // %bb.0: -; CHECK-FISEL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-FISEL-NEXT: mov x29, sp -; CHECK-FISEL-NEXT: sub sp, sp, #16 -; CHECK-FISEL-NEXT: rdsvl x8, #1 -; CHECK-FISEL-NEXT: mov x9, sp -; CHECK-FISEL-NEXT: mul x8, x8, x8 -; CHECK-FISEL-NEXT: sub x9, x9, x8 -; CHECK-FISEL-NEXT: mov sp, x9 -; CHECK-FISEL-NEXT: stur x9, [x29, #-16] -; CHECK-FISEL-NEXT: sub x9, x29, #16 -; CHECK-FISEL-NEXT: sturh w8, [x29, #-8] -; CHECK-FISEL-NEXT: msr TPIDR2_EL0, x9 -; CHECK-FISEL-NEXT: bl fmod -; CHECK-FISEL-NEXT: smstart za -; CHECK-FISEL-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-FISEL-NEXT: sub x0, x29, #16 -; CHECK-FISEL-NEXT: cbnz x8, .LBB10_2 -; CHECK-FISEL-NEXT: // %bb.1: -; CHECK-FISEL-NEXT: bl __arm_tpidr2_restore -; CHECK-FISEL-NEXT: .LBB10_2: -; CHECK-FISEL-NEXT: msr TPIDR2_EL0, xzr -; CHECK-FISEL-NEXT: mov sp, x29 -; CHECK-FISEL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ret -; -; CHECK-GISEL-LABEL: frem_call_za: -; CHECK-GISEL: // %bb.0: -; CHECK-GISEL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GISEL-NEXT: bl fmod -; CHECK-GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GISEL-NEXT: ret +; CHECK-COMMON-LABEL: frem_call_za: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: mul x8, x8, x8 +; CHECK-COMMON-NEXT: sub x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: bl fmod +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB10_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB10_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret %res = frem double %a, %b ret double %res } -; FIXME: As above this should use Selection DAG to make sure the libcall is lowered correctly. +; As above this should use Selection DAG to make sure the libcall is lowered correctly. define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind { ; CHECK-COMMON-LABEL: frem_call_sm: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp s0, s1, [sp, #72] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #72] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf -; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: str s0, [sp, #76] // 4-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #76] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ret %res = frem float %a, %b ret float %res } -; FIXME: As above this should use Selection DAG to make sure the libcall is lowered correctly. +; As above this should use Selection DAG to make sure the libcall is lowered correctly. define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-COMMON-LABEL: frem_call_sm_compat: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl __arm_sme_state +; CHECK-COMMON-NEXT: and x19, x0, #0x1 +; CHECK-COMMON-NEXT: tbz x19, #0, .LBB12_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: .LBB12_2: +; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf -; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-COMMON-NEXT: tbz x19, #0, .LBB12_4 +; CHECK-COMMON-NEXT: // %bb.3: +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: .LBB12_4: +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 ; CHECK-COMMON-NEXT: ret %res = frem float %a, %b ret float %res -- Gitee From 7346ab49c5c1fd40a289892075c4917029eddc97 Mon Sep 17 00:00:00 2001 From: "Manna, Soumi" Date: Wed, 23 Aug 2023 07:50:32 -0700 Subject: [PATCH 17/77] [NFC][CLANG] Fix static analyzer bugs about large copy by values Static Analyzer Tool complains about a large function call parameter which is passed by value in the CGBuiltin.cpp file. 1.
In CodeGenFunction::EmitSMELdrStr(clang::SVETypeFlags, llvm::SmallVectorImpl<llvm::Value *> &, unsigned int): We are passing parameter TypeFlags of type clang::SVETypeFlags by value. 2. In CodeGenFunction::EmitSMEZero(clang::SVETypeFlags, llvm::SmallVectorImpl<llvm::Value *> &, unsigned int): We are passing parameter TypeFlags of type clang::SVETypeFlags by value. 3. In CodeGenFunction::EmitSMEReadWrite(clang::SVETypeFlags, llvm::SmallVectorImpl<llvm::Value *> &, unsigned int): We are passing parameter TypeFlags of type clang::SVETypeFlags by value. 4. In CodeGenFunction::EmitSMELd1St1(clang::SVETypeFlags, llvm::SmallVectorImpl<llvm::Value *> &, unsigned int): We are passing parameter TypeFlags of type clang::SVETypeFlags by value. I see many places in the CGBuiltin.cpp file where we are passing parameter TypeFlags of type clang::SVETypeFlags by reference. clang::SVETypeFlags inherits several other types. This patch passes parameter TypeFlags by reference instead of by value in these functions. Reviewed By: tahonermann, sdesmalen Differential Revision: https://reviews.llvm.org/D158522 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/lib/CodeGen/CGBuiltin.cpp | 8 ++++---- clang/lib/CodeGen/CodeGenFunction.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 19e256e693c4..57ac75cba54f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9461,7 +9461,7 @@ Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) { return Builder.CreateAdd(Base, CastOffset, "tileslice"); } -Value *CodeGenFunction::EmitSMELd1St1(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { Ops[2] = EmitSVEPredicateCast( @@ -9491,7 +9491,7 @@ Value *CodeGenFunction::EmitSMELd1St1(SVETypeFlags TypeFlags, return Builder.CreateCall(F, NewOps); } -Value *CodeGenFunction::EmitSMEReadWrite(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { auto *VecTy = getSVEType(TypeFlags); @@ -9503,7 +9503,7 @@ Value *CodeGenFunction::EmitSMEReadWrite(SVETypeFlags TypeFlags, return Builder.CreateCall(F, Ops); } -Value *CodeGenFunction::EmitSMEZero(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { // svzero_za() intrinsic zeros the entire za tile and has no parameters. @@ -9513,7 +9513,7 @@ Value *CodeGenFunction::EmitSMEZero(SVETypeFlags TypeFlags, return Builder.CreateCall(F, Ops); } -Value *CodeGenFunction::EmitSMELdrStr(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { if (Ops.size() == 3) { diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 143e0707b942..e44504ac0213 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4287,16 +4287,16 @@ public: unsigned IntID); llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); - llvm::Value *EmitSMELd1St1(SVETypeFlags TypeFlags, + llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl<llvm::Value *> &Ops, unsigned IntID); - llvm::Value *EmitSMEReadWrite(SVETypeFlags TypeFlags, + llvm::Value *EmitSMEReadWrite(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl<llvm::Value *> &Ops, unsigned IntID); - llvm::Value *EmitSMEZero(SVETypeFlags TypeFlags, + llvm::Value *EmitSMEZero(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl<llvm::Value *> &Ops, unsigned IntID); - llvm::Value *EmitSMELdrStr(SVETypeFlags TypeFlags, + llvm::Value *EmitSMELdrStr(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl<llvm::Value *> &Ops, unsigned IntID); llvm::Value *EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); -- Gitee From 05f1fdc9a7f9b38088c427318c76b15b98358822 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Wed, 30 Aug 2023 08:54:46 +0000 Subject: [PATCH 18/77] [AArch64][SME] Promote mask for masked load to a similar type size with load value. The legalizer can keep the original mask type of a masked load that is combined with a sign/zero extend, but we then have to extend the mask to a type matching the combined load; otherwise instruction selection cannot lower the load. Differential Revision: https://reviews.llvm.org/D158386 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 10 +++- ...streaming-mode-fixed-length-masked-load.ll | 54 +++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ef04d9ceb72a..61d4a7e5c6f5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24933,7 +24933,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( EVT VT = Op.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG); + SDValue Mask = Load->getMask(); + // If this is an extending load and the mask type is not the same as + // load's type then we have to extend the mask type.
+ if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) { + assert(Load->getExtensionType() != ISD::NON_EXTLOAD && + "Incorrect mask type"); + Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask); + } + Mask = convertFixedMaskToScalableVector(Mask, DAG); SDValue PassThru; bool IsPassThruZeroOrUndef = false; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index e746770e29a2..a8301caf8695 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -335,6 +335,58 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { ret <4 x double> %load } +define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { +; CHECK-LABEL: masked_load_zext_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w2, [sp, #10] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: strh w1, [sp, #8] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: lsl z0.h, z0.h, #15 +; CHECK-NEXT: asr z0.h, z0.h, #15 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) + %extend = zext <3 x i16> %load_value to <3 x i32> + ret <3 x i32> %extend; +} + +define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { +; CHECK-LABEL: masked_load_sext_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w2, [sp, #10] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: strh w1, [sp, #8] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: lsl z0.h, z0.h, #15 +; CHECK-NEXT: asr z0.h, z0.h, #15 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) + %extend = sext <3 x i16> %load_value to <3 x i32> + ret <3 x i32> %extend; +} + declare <4 x i8> @llvm.masked.load.v4i8(ptr, i32, <4 x i1>, <4 x i8>) declare <8 x i8> @llvm.masked.load.v8i8(ptr, i32, <8 x i1>, <8 x i8>) declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>) @@ -351,3 +403,5 @@ declare <8 x float> @llvm.masked.load.v8f32(ptr, i32, <8 x i1>, <8 x float>) declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>) declare <4 x double> @llvm.masked.load.v4f64(ptr, i32, <4 x i1>, <4 x double>) + +declare <3 x i16> @llvm.masked.load.v3i16.p0(ptr, i32, <3 x i1>, <3 x i16>) -- Gitee From 5fb199e81ed68f8448c283ef029246c7659ebc47 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 30 Aug 2023 13:28:29 +0100 Subject: [PATCH 19/77] [AArch64][SME] NFC: Rename hasNewZAInterface to hasNewZABody. 
__arm_new_za is a declaration attribute, not a type attribute, and is therefore not part of the interface of a function. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/lib/Target/AArch64/SMEABIPass.cpp | 2 +- llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp | 4 ++-- llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h | 4 ++-- llvm/unittests/Target/AArch64/SMEAttributesTest.cpp | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index c60234fd85b5..06bb85768725 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -197,7 +197,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, if (CallerAttrs.requiresSMChange(CalleeAttrs, /*BodyOverridesInterface=*/true) || CallerAttrs.requiresLazySave(CalleeAttrs) || - CalleeAttrs.hasNewZAInterface()) + CalleeAttrs.hasNewZABody()) return false; const TargetMachine &TM = getTLI()->getTargetMachine(); diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 83010017c761..72e87a663fce 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -137,7 +137,7 @@ bool SMEABI::runOnFunction(Function &F) { bool Changed = false; SMEAttrs FnAttrs(F); - if (FnAttrs.hasNewZAInterface()) + if (FnAttrs.hasNewZABody()) Changed |= updateNewZAFunctions(M, &F, Builder); return Changed; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 0edb7cb98640..b2c126bbc6f3 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -20,9 +20,9 @@ void SMEAttrs::set(unsigned M, bool Enable) { assert(!(hasStreamingInterface() && hasStreamingCompatibleInterface()) && "SM_Enabled and SM_Compatible are mutually exclusive"); - assert(!(hasNewZAInterface() && hasSharedZAInterface()) && + assert(!(hasNewZABody() && hasSharedZAInterface()) && "ZA_New and ZA_Shared are mutually exclusive"); - assert(!(hasNewZAInterface() && preservesZA()) && + assert(!(hasNewZABody() && preservesZA()) && "ZA_New and ZA_Preserved are mutually exclusive"); } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 1146fd4e3fa8..587765a7d63b 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -73,12 +73,12 @@ public: bool BodyOverridesInterface = false) const; // Interfaces to query PSTATE.ZA - bool hasNewZAInterface() const { return Bitmask & ZA_New; } + bool hasNewZABody() const { return Bitmask & ZA_New; } bool hasSharedZAInterface() const { return Bitmask & ZA_Shared; } bool hasPrivateZAInterface() const { return !hasSharedZAInterface(); } bool preservesZA() const { return Bitmask & ZA_Preserved; } bool hasZAState() const { return hasNewZABody() || hasSharedZAInterface(); } bool requiresLazySave(const SMEAttrs &Callee) const { return hasZAState() && Callee.hasPrivateZAInterface() && diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp index 233c75a138ae..7780c71bbc00
100644 --- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp +++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp @@ -44,7 +44,7 @@ TEST(SMEAttributes, Constructors) { ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_new\"") ->getFunction("foo")) - .hasNewZAInterface()); + .hasNewZABody()); ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_preserved\"") ->getFunction("foo")) @@ -87,12 +87,12 @@ TEST(SMEAttributes, Basics) { ASSERT_TRUE(SA(SA::ZA_Shared | SA::ZA_Preserved).preservesZA()); ASSERT_TRUE(SA(SA::ZA_New).hasPrivateZAInterface()); - ASSERT_TRUE(SA(SA::ZA_New).hasNewZAInterface()); + ASSERT_TRUE(SA(SA::ZA_New).hasNewZABody()); ASSERT_TRUE(SA(SA::ZA_New).hasZAState()); ASSERT_FALSE(SA(SA::ZA_New).preservesZA()); ASSERT_TRUE(SA(SA::Normal).hasPrivateZAInterface()); - ASSERT_FALSE(SA(SA::Normal).hasNewZAInterface()); + ASSERT_FALSE(SA(SA::Normal).hasNewZABody()); ASSERT_FALSE(SA(SA::Normal).hasZAState()); ASSERT_FALSE(SA(SA::Normal).preservesZA()); } -- Gitee From 433bf587ef3fce805617f205ee1088ac117ed0b2 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 4 Sep 2023 10:15:26 +0000 Subject: [PATCH 20/77] [AArch64][SME] Make the overloaded svreinterpret_* functions streaming-compatible. Otherwise these functions are not inlined when invoked from streaming functions. Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D159188 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/Attr.td | 2 +- ...acle_sve_reinterpret_from_streaming_mode.c | 35 +++++++++++++++++++ clang/utils/TableGen/SveEmitter.cpp | 2 +- 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 845286606777..20d4c7262884 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -415,7 +415,7 @@ class TargetArch arches> : TargetSpec { let Arches = arches; } def TargetARM : TargetArch<["arm", "thumb", "armeb", "thumbeb"]>; -def TargetAArch64 : TargetArch<["aarch64"]>; +def TargetAArch64 : TargetArch<["aarch64", "aarch64_be", "aarch64_32"]>; def TargetAnyArm : TargetArch; def TargetAVR : TargetArch<["avr"]>; def TargetBPF : TargetArch<["bpfel", "bpfeb"]>; diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c new file mode 100644 index 000000000000..b430a0ba3cc7 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c @@ -0,0 +1,35 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s + +// Note: We need to run this test with '-O1' because oddly enough the svreinterpret is always inlined at -O0. + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// Test that svreinterpret is inlined (because it should be streaming-compatible) +__attribute__((target("sme"))) +// CHECK-LABEL: @test_svreinterpret_s16_s8_from_streaming_mode( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z45test_svreinterpret_s16_s8_from_streaming_modeu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svreinterpret_s16_s8_from_streaming_mode(svint8_t op) __arm_streaming { + return SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)(op); +} + diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index dbf5122fdf22..f725c3954005 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1284,7 +1284,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) { if (ShortForm) { OS << "__aio __attribute__((target(\"sve\"))) " << From.Type << " svreinterpret_" << From.Suffix; - OS << "(" << To.Type << " op) {\n"; + OS << "(" << To.Type << " op) __arm_streaming_compatible {\n"; OS << " return __builtin_sve_reinterpret_" << From.Suffix << "_" << To.Suffix << "(op);\n"; OS << "}\n\n"; -- Gitee From e8629349882f1e6de37ddcc1cd5324dbd55836cb Mon Sep 17 00:00:00 2001 From: sdesmalen-arm Date: Mon, 4 Sep 2023 15:11:22 +0100 Subject: [PATCH 21/77] [AArch64][SME] Disable tail-call optimization for __arm_locally_streaming functions. (#65258) When calling a function which requires no streaming-mode change from an __arm_locally_streaming function, LLVM would otherwise emit: // function prologue smstart b streaming_compatible_function // tail call // never an smstop Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 ++- .../CodeGen/AArch64/sme-streaming-body.ll | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 61d4a7e5c6f5..35f66d7eb27c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6923,7 +6923,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( SMEAttrs CallerAttrs(MF.getFunction()); auto CalleeAttrs = CLI.CB ? 
SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); if (CallerAttrs.requiresSMChange(CalleeAttrs) || - CallerAttrs.requiresLazySave(CalleeAttrs)) + CallerAttrs.requiresLazySave(CalleeAttrs) || + CallerAttrs.hasStreamingBody()) return false; // Functions using the C or Fast calling convention that have an SVE signature diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index c30e991d8f2f..5e4cdc46843d 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -308,3 +308,24 @@ for.cond.cleanup: ret float %add } + +define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: disable_tailcallopt: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + tail call void @streaming_compatible_callee(); + ret void; } -- Gitee From f5eed6de7d66be4300a71f03682fe1e4ec43177e Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Fri, 15 Sep 2023 11:40:30 +0100 Subject: [PATCH 22/77] [AArch64][SME] Emit Zero instruction for NewZA functions [The ACLE](https://github.com/ARM-software/acle/pull/268) demands that functions with the aarch64_pstate_za_new attribute set all bits of the ZA register to zero upon entry.
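To make the required entry/exit sequence concrete, here is a minimal LLVM IR sketch of what the SMEABI pass is expected to produce for a function carrying this attribute after the change. It is illustrative only, not taken from this patch's diffs; the function name is a placeholder:

define void @example() "aarch64_pstate_za_new" {
  ; Enable ZA on entry, then zero all eight ZA tiles (mask 0xff = 255).
  call void @llvm.aarch64.sme.za.enable()
  call void @llvm.aarch64.sme.zero(i32 255)
  ; ... function body ...
  ; Disable ZA again before returning.
  call void @llvm.aarch64.sme.za.disable()
  ret void
}

declare void @llvm.aarch64.sme.za.enable()
declare void @llvm.aarch64.sme.zero(i32)
declare void @llvm.aarch64.sme.za.disable()

The i32 mask operand of @llvm.aarch64.sme.zero selects which ZA tiles to zero; 0xff selects all of them, matching the Builder.getInt32(0xff) call added to SMEABIPass.cpp below.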
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- llvm/lib/Target/AArch64/SMEABIPass.cpp | 6 ++++++ llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll | 1 + llvm/test/CodeGen/AArch64/sme-new-za-function.ll | 2 ++ 3 files changed, 9 insertions(+) diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 72e87a663fce..c813d92ec85b 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -112,6 +112,12 @@ bool SMEABI::updateNewZAFunctions(Module *M, Function *F, Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + // ZA state must be zeroed upon entry to a function with NewZA + Function *ZeroIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); + Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, + Builder.getInt32(0xff)); + // Before returning, disable pstate.za for (BasicBlock &BB : *F) { Instruction *T = BB.getTerminator(); diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 0799b2a6edbe..7eaa0c3cf8b7 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -219,6 +219,7 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: b .LBB6_2 ; CHECK-COMMON-NEXT: .LBB6_2: // %entry ; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: zero {za} ; CHECK-COMMON-NEXT: bl za_shared_callee ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 ; CHECK-COMMON-NEXT: fmov d1, x8 diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll index 54ef5fd43275..0cee26dbb349 100644 --- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll +++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll @@ -15,6 +15,7 @@ define void @private_za() "aarch64_pstate_za_new" { ; CHECK-NEXT: br label [[TMP0]] ; CHECK: 0: ; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.zero(i32 255) ; CHECK-NEXT: call void @shared_za_callee() ; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() ; CHECK-NEXT: ret void @@ -35,6 +36,7 @@ define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_pstate_ ; CHECK-NEXT: br label [[ENTRY]] ; CHECK: entry: ; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.zero(i32 255) ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[COND:%.*]], 1 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]] ; CHECK: if.else: -- Gitee From fccf8eb813f0a8262cc632e07f49381455578ec2 Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Wed, 30 Aug 2023 11:10:45 +0000 Subject: [PATCH 23/77] [AArch64][SME] Enable TPIDR2 lazy-save for za_preserved This change makes callees with the __arm_preserves_za type attribute comply with the dormant state requirements when its caller has the __arm_shared_za type attribute. Several external SME functions also do not need to lazy save.
https://github.com/ARM-software/abi-aa/blob/5e67092434b50c04f8ad178a9c272ce3c6ada7fd/aapcs64/aapcs64.rst?plain=1#L1381 Differential Revision: https://reviews.llvm.org/D159186 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 98 +++++++++---------- .../AArch64/Utils/AArch64SMEAttributes.cpp | 18 +++- .../AArch64/Utils/AArch64SMEAttributes.h | 4 +- .../CodeGen/AArch64/sme-lazy-save-call.ll | 47 +++++++++ 4 files changed, 111 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 35f66d7eb27c..7adb50c33d32 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4784,17 +4784,6 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, Mask); } -static std::optional getCalleeAttrsFromExternalFunction(SDValue V) { - if (auto *ES = dyn_cast(V)) { - StringRef S(ES->getSymbol()); - if (S == "__arm_sme_state" || S == "__arm_tpidr2_save") - return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved); - if (S == "__arm_tpidr2_restore") - return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared); - } - return std::nullopt; -} - SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); @@ -7260,28 +7249,31 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); if (CLI.CB) CalleeAttrs = SMEAttrs(*CLI.CB); - else if (std::optional Attrs = - getCalleeAttrsFromExternalFunction(CLI.Callee)) - CalleeAttrs = *Attrs; + else if (auto *ES = dyn_cast(CLI.Callee)) + CalleeAttrs = SMEAttrs(ES->getSymbol()); bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); - - MachineFrameInfo &MFI = MF.getFrameInfo(); if (RequiresLazySave) { - // Set up a lazy save mechanism by storing the runtime live slices - // (worst-case N*N) to the TPIDR2 stack object. - SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); + SDValue NumZaSaveSlices; + if (!CalleeAttrs.preservesZA()) { + // Set up a lazy save mechanism by storing the runtime live slices + // (worst-case SVL*SVL) to the TPIDR2 stack object. 
+ SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + NumZaSaveSlices = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + } else if (CalleeAttrs.preservesZA()) { + NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64); + } + unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - SDValue BufferPtrAddr = + SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); - Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16); + Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, + MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), @@ -7388,6 +7380,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); + MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateStackObject(StoreSize, Alignment, false); if (isScalable) MFI.setStackID(FI, TargetStackID::ScalableVector); @@ -7704,35 +7697,34 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (RequiresLazySave) { - // Unconditionally resume ZA. - Result = DAG.getNode( - AArch64ISD::SMSTART, DL, MVT::Other, Result, - DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); - - // Conditionally restore the lazy save using a pseudo node. - unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. - SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, - DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, - RegMask, - Result.getValue(1)}); - + if (!CalleeAttrs.preservesZA()) { + // Unconditionally resume ZA. + Result = DAG.getNode( + AArch64ISD::SMSTART, DL, MVT::Other, Result, + DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + + // Conditionally restore the lazy save using a pseudo node. + unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); + SDValue RegMask = DAG.getRegisterMask( + TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + SDValue RestoreRoutine = DAG.getTargetExternalSymbol( + "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); + SDValue TPIDR2_EL0 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, + DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + + // Copy the address of the TPIDR2 block into X0 before 'calling' the + // RESTORE_ZA pseudo. 
+ SDValue Glue; + SDValue TPIDR2Block = DAG.getFrameIndex( + FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); + Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Result, TPIDR2_EL0, + DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Result.getValue(1)}); + } // Finally reset the TPIDR2_EL0 register to 0. Result = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Result, diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index b2c126bbc6f3..0082b4017986 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -24,12 +24,26 @@ void SMEAttrs::set(unsigned M, bool Enable) { "ZA_New and ZA_Shared are mutually exclusive"); assert(!(hasNewZABody() && preservesZA()) && "ZA_New and ZA_Preserved are mutually exclusive"); + assert(!(hasNewZABody() && (Bitmask & ZA_NoLazySave)) && + "ZA_New and ZA_NoLazySave are mutually exclusive"); + assert(!(hasSharedZAInterface() && (Bitmask & ZA_NoLazySave)) && + "ZA_Shared and ZA_NoLazySave are mutually exclusive"); } SMEAttrs::SMEAttrs(const CallBase &CB) { *this = SMEAttrs(CB.getAttributes()); - if (auto *F = CB.getCalledFunction()) - set(SMEAttrs(*F).Bitmask); + if (auto *F = CB.getCalledFunction()) { + set(SMEAttrs(*F).Bitmask | SMEAttrs(F->getName()).Bitmask); + } +} + +SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) { + if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") + Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved | + SMEAttrs::ZA_NoLazySave); + if (FuncName == "__arm_tpidr2_restore") + Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared | + SMEAttrs::ZA_NoLazySave); } SMEAttrs::SMEAttrs(const AttributeList &Attrs) { diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 587765a7d63b..e766b778b541 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -35,6 +35,7 @@ public: ZA_Shared = 1 << 3, // aarch64_pstate_sm_shared ZA_New = 1 << 4, // aarch64_pstate_sm_new ZA_Preserved = 1 << 5, // aarch64_pstate_sm_preserved + ZA_NoLazySave = 1 << 6, // Used for SME ABI routines to avoid lazy saves All = ZA_Preserved - 1 }; @@ -42,6 +43,7 @@ public: SMEAttrs(const Function &F) : SMEAttrs(F.getAttributes()) {} SMEAttrs(const CallBase &CB); SMEAttrs(const AttributeList &L); + SMEAttrs(StringRef FuncName); void set(unsigned M, bool Enable = true); @@ -82,7 +84,7 @@ public: } bool requiresLazySave(const SMEAttrs &Callee) const { return hasZAState() && Callee.hasPrivateZAInterface() && - !Callee.preservesZA(); + !(Callee.Bitmask & ZA_NoLazySave); } }; diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 00c1c9d66c3e..9576c975a0c4 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s declare void @private_za_callee() +declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved" declare float @llvm.cos.f32(float) ; Test lazy-save mechanism for a single callee. 
@@ -165,3 +166,49 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z call void @private_za_callee() ret void } + + +; Test lazy-save mechanism for an aarch64_pstate_za_shared caller +; calling a callee with aarch64_pstate_za_preserved. +define void @za_shared_caller_za_preserved_callee() nounwind "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: za_shared_caller_za_preserved_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-80] +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: sturh wzr, [x29, #-72] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB4_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: bl private_za_preserved_callee +; CHECK-NEXT: tbz x19, #0, .LBB4_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @private_za_preserved_callee() + ret void +} -- Gitee From 10be0ccae2f2dd5ac23b27a15dfa623dd392cec8 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Thu, 21 Sep 2023 19:53:16 +0100 Subject: [PATCH 24/77] [AArch64] Separate PNR into its own Register Class (#65306) This patch separates PNR registers into their own register class instead of sharing a register class with PPR registers. This primarily allows us to return more accurate register classes when applying assembly constraints, and also provides more protection against supplying an incorrect predicate type for a register operand.
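For illustration, the PNR<->PPR copy support added below reduces to simple register-number arithmetic, since pN and pnN name the same architectural predicate register (a hypothetical C++ helper; the in-tree code in copyPhysReg uses the AArch64::PN0/AArch64::P0 constants directly):

    #include "llvm/MC/MCRegister.h"

    // Rebase a register number from one class's first register onto the
    // other's, e.g. pn3 -> p3. When the source and destination indices
    // already match the copy is a no-op; otherwise an ORR_PPzPP self-move
    // materialises the value in the destination register.
    static llvm::MCRegister rebasePredReg(llvm::MCRegister R,
                                          unsigned FromBase, unsigned ToBase) {
      return llvm::MCRegister(R.id() - FromBase + ToBase);
    }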
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 2 + llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 52 ++++++ .../lib/Target/AArch64/AArch64RegisterInfo.td | 172 +++++++++++------- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 24 +-- .../AArch64/AsmParser/AArch64AsmParser.cpp | 43 +++-- .../Disassembler/AArch64Disassembler.cpp | 21 ++- .../MCTargetDesc/AArch64InstPrinter.cpp | 5 +- .../AArch64/MCTargetDesc/AArch64InstPrinter.h | 1 - .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 11 ++ llvm/lib/Target/AArch64/SMEInstrFormats.td | 16 +- .../GlobalISel/irtranslator-inline-asm.ll | 23 ++- .../irtranslator-unwind-inline-asm.ll | 2 +- .../AArch64/GlobalISel/regbank-inlineasm.mir | 4 +- llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir | 44 +++++ llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 3 + .../callbr-asm-outputs-indirect-isel.ll | 26 +-- .../emit_fneg_with_non_register_operand.mir | 4 +- .../CodeGen/AArch64/peephole-insvigpr.mir | 3 +- llvm/test/CodeGen/AArch64/preserve.ll | 4 +- llvm/test/CodeGen/AArch64/spillfill-sve.mir | 45 +++++ llvm/test/MC/AArch64/SVE/cntp-diagnostics.s | 2 +- 21 files changed, 364 insertions(+), 143 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 76f1cc782b24..b090efe3bb34 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -970,6 +970,8 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, RegClass = &AArch64::ZPRRegClass; } else if (AArch64::PPRRegClass.contains(Reg)) { RegClass = &AArch64::PPRRegClass; + } else if (AArch64::PNRRegClass.contains(Reg)) { + RegClass = &AArch64::PNRRegClass; } else { RegClass = &AArch64::FPR128RegClass; AltName = AArch64::vreg; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 0691e07a639b..e12dfeca8e24 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3580,6 +3580,39 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (AArch64::PNRRegClass.contains(DestReg) && + AArch64::PPRRegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected predicate-as-counter register."); + // Copy from pX to pnX is a no-op + if ((DestReg.id() - AArch64::PN0) == (SrcReg.id() - AArch64::P0)) + return; + MCRegister PPRDestReg = (DestReg - AArch64::PN0) + AArch64::P0; + BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addDef(DestReg, RegState::Implicit); + return; + } + + if (AArch64::PPRRegClass.contains(DestReg) && + AArch64::PNRRegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected predicate-as-counter register."); + // Copy from pnX to pX is a no-op + if ((DestReg.id() - AArch64::P0) == (SrcReg.id() - AArch64::PN0)) + return; + MCRegister PNRDestReg = (DestReg - AArch64::P0) + AArch64::PN0; + MCRegister PPRSrcReg = (SrcReg - AArch64::PN0) + AArch64::P0; + BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) + .addReg(PPRSrcReg) + .addReg(PPRSrcReg) + .addReg(PPRSrcReg, getKillRegState(KillSrc)) + .addDef(PNRDestReg, RegState::Implicit); + return; + } + // Copy a Z register by ORRing with itself. 
if (AArch64::ZPRRegClass.contains(DestReg) && AArch64::ZPRRegClass.contains(SrcReg)) { @@ -3869,6 +3902,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); unsigned Opc = 0; bool Offset = true; + MCRegister PNRReg = MCRegister::NoRegister; unsigned StackID = TargetStackID::Default; switch (TRI->getSpillSize(*RC)) { case 1: @@ -3882,6 +3916,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_PXI; StackID = TargetStackID::ScalableVector; + } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected register store without SVE2p1 or SME2"); + SrcReg = (SrcReg - AArch64::PN0) + AArch64::P0; + Opc = AArch64::STR_PXI; + StackID = TargetStackID::ScalableVector; } break; case 4: @@ -3982,6 +4022,8 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (Offset) MI.addImm(0); + if (PNRReg.isValid()) + MI.addDef(PNRReg, RegState::Implicit); MI.addMemOperand(MMO); } @@ -4026,6 +4068,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opc = 0; bool Offset = true; unsigned StackID = TargetStackID::Default; + MCRegister PNRReg = MCRegister::NoRegister; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) @@ -4038,6 +4081,13 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_PXI; StackID = TargetStackID::ScalableVector; + } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected register load without SVE2p1 or SME2"); + PNRReg = DestReg; + DestReg = (DestReg - AArch64::PN0) + AArch64::P0; + Opc = AArch64::LDR_PXI; + StackID = TargetStackID::ScalableVector; } break; case 4: @@ -4138,6 +4188,8 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI); if (Offset) MI.addImm(0); + if (PNRReg.isValid()) + MI.addDef(PNRReg, RegState::Implicit); MI.addMemOperand(MMO); } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 4bb1f9413f2b..eeb09389a1b8 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -55,6 +55,8 @@ let Namespace = "AArch64" in { def zasubd1 : SubRegIndex<256>; // (16 x 16)/8 bytes = 256 bits def zasubq0 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits def zasubq1 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits + + def psub : SubRegIndex<16>; } let Namespace = "AArch64" in { @@ -767,23 +769,43 @@ def GPR64x8 : RegisterOperand { //===----- END: v8.7a accelerator extension register operands -------------===// +// SVE predicate-as-counter registers + def PN0 : AArch64Reg<0, "pn0">, DwarfRegNum<[48]>; + def PN1 : AArch64Reg<1, "pn1">, DwarfRegNum<[49]>; + def PN2 : AArch64Reg<2, "pn2">, DwarfRegNum<[50]>; + def PN3 : AArch64Reg<3, "pn3">, DwarfRegNum<[51]>; + def PN4 : AArch64Reg<4, "pn4">, DwarfRegNum<[52]>; + def PN5 : AArch64Reg<5, "pn5">, DwarfRegNum<[53]>; + def PN6 : AArch64Reg<6, "pn6">, DwarfRegNum<[54]>; + def PN7 : AArch64Reg<7, "pn7">, DwarfRegNum<[55]>; + def PN8 : AArch64Reg<8, "pn8">, DwarfRegNum<[56]>; + def PN9 : AArch64Reg<9, "pn9">, DwarfRegNum<[57]>; + def PN10 : AArch64Reg<10, "pn10">, DwarfRegNum<[58]>; + def PN11 : 
AArch64Reg<11, "pn11">, DwarfRegNum<[59]>; + def PN12 : AArch64Reg<12, "pn12">, DwarfRegNum<[60]>; + def PN13 : AArch64Reg<13, "pn13">, DwarfRegNum<[61]>; + def PN14 : AArch64Reg<14, "pn14">, DwarfRegNum<[62]>; + def PN15 : AArch64Reg<15, "pn15">, DwarfRegNum<[63]>; + // SVE predicate registers -def P0 : AArch64Reg<0, "p0">, DwarfRegNum<[48]>; -def P1 : AArch64Reg<1, "p1">, DwarfRegNum<[49]>; -def P2 : AArch64Reg<2, "p2">, DwarfRegNum<[50]>; -def P3 : AArch64Reg<3, "p3">, DwarfRegNum<[51]>; -def P4 : AArch64Reg<4, "p4">, DwarfRegNum<[52]>; -def P5 : AArch64Reg<5, "p5">, DwarfRegNum<[53]>; -def P6 : AArch64Reg<6, "p6">, DwarfRegNum<[54]>; -def P7 : AArch64Reg<7, "p7">, DwarfRegNum<[55]>; -def P8 : AArch64Reg<8, "p8">, DwarfRegNum<[56]>; -def P9 : AArch64Reg<9, "p9">, DwarfRegNum<[57]>; -def P10 : AArch64Reg<10, "p10">, DwarfRegNum<[58]>; -def P11 : AArch64Reg<11, "p11">, DwarfRegNum<[59]>; -def P12 : AArch64Reg<12, "p12">, DwarfRegNum<[60]>; -def P13 : AArch64Reg<13, "p13">, DwarfRegNum<[61]>; -def P14 : AArch64Reg<14, "p14">, DwarfRegNum<[62]>; -def P15 : AArch64Reg<15, "p15">, DwarfRegNum<[63]>; +let SubRegIndices = [psub] in { + def P0 : AArch64Reg<0, "p0", [PN0]>, DwarfRegAlias; + def P1 : AArch64Reg<1, "p1", [PN1]>, DwarfRegAlias; + def P2 : AArch64Reg<2, "p2", [PN2]>, DwarfRegAlias; + def P3 : AArch64Reg<3, "p3", [PN3]>, DwarfRegAlias; + def P4 : AArch64Reg<4, "p4", [PN4]>, DwarfRegAlias; + def P5 : AArch64Reg<5, "p5", [PN5]>, DwarfRegAlias; + def P6 : AArch64Reg<6, "p6", [PN6]>, DwarfRegAlias; + def P7 : AArch64Reg<7, "p7", [PN7]>, DwarfRegAlias; + def P8 : AArch64Reg<8, "p8", [PN8]>, DwarfRegAlias; + def P9 : AArch64Reg<9, "p9", [PN9]>, DwarfRegAlias; + def P10 : AArch64Reg<10, "p10", [PN10]>, DwarfRegAlias; + def P11 : AArch64Reg<11, "p11", [PN11]>, DwarfRegAlias; + def P12 : AArch64Reg<12, "p12", [PN12]>, DwarfRegAlias; + def P13 : AArch64Reg<13, "p13", [PN13]>, DwarfRegAlias; + def P14 : AArch64Reg<14, "p14", [PN14]>, DwarfRegAlias; + def P15 : AArch64Reg<15, "p15", [PN15]>, DwarfRegAlias; +} // The part of SVE registers that don't overlap Neon registers. // These are only used as part of clobber lists. 
@@ -881,8 +903,6 @@ class SVERegOp : SVERegOp {} class ZPRRegOp : SVERegOp {} @@ -891,7 +911,7 @@ class ZPRRegOp : RegisterClass< "AArch64", - [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1, aarch64svcount ], 16, + [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, (sequence "P%u", firstreg, lastreg)> { let Size = 16; } @@ -909,69 +929,89 @@ class PPRAsmOperand : AsmOperandClass { let ParserMethod = "tryParseSVEPredicateVector"; } -def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", 0>; -def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>; -def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>; -def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>; -def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>; - -def PPRAny : PPRRegOp<"", PPRAsmOpAny, ElementSizeNone, PPR>; -def PPR8 : PPRRegOp<"b", PPRAsmOp8, ElementSizeB, PPR>; -def PPR16 : PPRRegOp<"h", PPRAsmOp16, ElementSizeH, PPR>; -def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>; -def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>; - +def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", 0>; +def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>; +def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>; +def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>; +def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>; def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>; +class PPRRegOp : SVERegOp {} + +def PPRAny : PPRRegOp<"", PPRAsmOpAny, ElementSizeNone, PPR>; +def PPR8 : PPRRegOp<"b", PPRAsmOp8, ElementSizeB, PPR>; +def PPR16 : PPRRegOp<"h", PPRAsmOp16, ElementSizeH, PPR>; +def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>; +def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>; def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, ElementSizeNone, PPR_3b>; +class PNRClass : RegisterClass< + "AArch64", + [ aarch64svcount ], 16, + (sequence "PN%u", firstreg, lastreg)> { + let Size = 16; +} + +def PNR : PNRClass<0, 15>; +def PNR_p8to15 : PNRClass<8, 15>; // SVE predicate-as-counter operand -class PNRAsmOperand - : PPRAsmOperand { +class PNRAsmOperand: AsmOperandClass { + let Name = "SVE" # name # "Reg"; let PredicateMethod = "isSVEPredicateAsCounterRegOfWidth<" # Width # ", " # "AArch64::" # RegClass # "RegClassID>"; let DiagnosticType = "InvalidSVE" # name # "Reg"; + let RenderMethod = "addRegOperands"; let ParserMethod = "tryParseSVEPredicateVector"; } -class PNRRegOp - : PPRRegOp { - let PrintMethod = "printPredicateAsCounter<" # EltSize # ">"; +let RenderMethod = "addPNRasPPRRegOperands" in { + def PNRasPPROpAny : PNRAsmOperand<"PNRasPPRPredicateAny", "PNR", 0>; + def PNRasPPROp8 : PNRAsmOperand<"PNRasPPRPredicateB", "PNR", 8>; } -def PNRAsmOpAny: PNRAsmOperand<"PNPredicateAny", "PPR", 0>; -def PNRAsmOp8 : PNRAsmOperand<"PNPredicateB", "PPR", 8>; -def PNRAsmOp16 : PNRAsmOperand<"PNPredicateH", "PPR", 16>; -def PNRAsmOp32 : PNRAsmOperand<"PNPredicateS", "PPR", 32>; -def PNRAsmOp64 : PNRAsmOperand<"PNPredicateD", "PPR", 64>; - -def PNRAny : PNRRegOp<"", PNRAsmOpAny, 0, PPR>; -def PNR8 : PNRRegOp<"b", PNRAsmOp8, 8, PPR>; -def PNR16 : PNRRegOp<"h", PNRAsmOp16, 16, PPR>; -def PNR32 : PNRRegOp<"s", PNRAsmOp32, 32, PPR>; -def PNR64 : PNRRegOp<"d", PNRAsmOp64, 64, PPR>; - -class PNRP8to15RegOp - : PPRRegOp { - let PrintMethod = "printPredicateAsCounter<" # EltSize # ">"; - let EncoderMethod = "EncodePPR_p8to15"; - let DecoderMethod = "DecodePPR_p8to15RegisterClass"; -} - -def PNRAsmAny_p8to15 : PNRAsmOperand<"PNPredicateAny_p8to15", "PPR_p8to15", 0>; -def PNRAsmOp8_p8to15 : PNRAsmOperand<"PNPredicateB_p8to15", 
"PPR_p8to15", 8>; -def PNRAsmOp16_p8to15 : PNRAsmOperand<"PNPredicateH_p8to15", "PPR_p8to15", 16>; -def PNRAsmOp32_p8to15 : PNRAsmOperand<"PNPredicateS_p8to15", "PPR_p8to15", 32>; -def PNRAsmOp64_p8to15 : PNRAsmOperand<"PNPredicateD_p8to15", "PPR_p8to15", 64>; - -def PNRAny_p8to15 : PNRP8to15RegOp<"", PNRAsmAny_p8to15, 0, PPR_p8to15>; -def PNR8_p8to15 : PNRP8to15RegOp<"b", PNRAsmOp8_p8to15, 8, PPR_p8to15>; -def PNR16_p8to15 : PNRP8to15RegOp<"h", PNRAsmOp16_p8to15, 16, PPR_p8to15>; -def PNR32_p8to15 : PNRP8to15RegOp<"s", PNRAsmOp32_p8to15, 32, PPR_p8to15>; -def PNR64_p8to15 : PNRP8to15RegOp<"d", PNRAsmOp64_p8to15, 64, PPR_p8to15>; +class PNRasPPRRegOp : SVERegOp {} +def PNRasPPRAny : PNRasPPRRegOp<"", PNRasPPROpAny, ElementSizeNone, PPR>; +def PNRasPPR8 : PNRasPPRRegOp<"b", PNRasPPROp8, ElementSizeB, PPR>; + +def PNRAsmOpAny: PNRAsmOperand<"PNPredicateAny", "PNR", 0>; +def PNRAsmOp8 : PNRAsmOperand<"PNPredicateB", "PNR", 8>; +def PNRAsmOp16 : PNRAsmOperand<"PNPredicateH", "PNR", 16>; +def PNRAsmOp32 : PNRAsmOperand<"PNPredicateS", "PNR", 32>; +def PNRAsmOp64 : PNRAsmOperand<"PNPredicateD", "PNR", 64>; + +class PNRRegOp + : SVERegOp { + let PrintMethod = "printPredicateAsCounter<" # Size # ">"; +} +def PNRAny : PNRRegOp<"", PNRAsmOpAny, 0, PNR>; +def PNR8 : PNRRegOp<"b", PNRAsmOp8, 8, PNR>; +def PNR16 : PNRRegOp<"h", PNRAsmOp16, 16, PNR>; +def PNR32 : PNRRegOp<"s", PNRAsmOp32, 32, PNR>; +def PNR64 : PNRRegOp<"d", PNRAsmOp64, 64, PNR>; + +def PNRAsmAny_p8to15 : PNRAsmOperand<"PNPredicateAny_p8to15", "PNR_p8to15", 0>; +def PNRAsmOp8_p8to15 : PNRAsmOperand<"PNPredicateB_p8to15", "PNR_p8to15", 8>; +def PNRAsmOp16_p8to15 : PNRAsmOperand<"PNPredicateH_p8to15", "PNR_p8to15", 16>; +def PNRAsmOp32_p8to15 : PNRAsmOperand<"PNPredicateS_p8to15", "PNR_p8to15", 32>; +def PNRAsmOp64_p8to15 : PNRAsmOperand<"PNPredicateD_p8to15", "PNR_p8to15", 64>; + +class PNRP8to15RegOp + : SVERegOp { + let PrintMethod = "printPredicateAsCounter<" # Width # ">"; + let EncoderMethod = "EncodePNR_p8to15"; + let DecoderMethod = "DecodePNR_p8to15RegisterClass"; +} + +def PNRAny_p8to15 : PNRP8to15RegOp<"", PNRAsmAny_p8to15, 0, PNR_p8to15>; +def PNR8_p8to15 : PNRP8to15RegOp<"b", PNRAsmOp8_p8to15, 8, PNR_p8to15>; +def PNR16_p8to15 : PNRP8to15RegOp<"h", PNRAsmOp16_p8to15, 16, PNR_p8to15>; +def PNR32_p8to15 : PNRP8to15RegOp<"s", PNRAsmOp32_p8to15, 32, PNR_p8to15>; +def PNR64_p8to15 : PNRP8to15RegOp<"d", PNRAsmOp64_p8to15, 64, PNR_p8to15>; let Namespace = "AArch64" in { def psub0 : SubRegIndex<16, -1>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index b4f02e0dd203..307535f6e4ae 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2544,8 +2544,8 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>; def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv16i1 (bitconvert (aarch64svcount PPR:$src))), (nxv16i1 PPR:$src)>; - def : Pat<(aarch64svcount (bitconvert (nxv16i1 PPR:$src))), (aarch64svcount PPR:$src)>; + def : Pat<(nxv16i1 (bitconvert (aarch64svcount PNR:$src))), (nxv16i1 PPR:$src)>; + def : Pat<(aarch64svcount (bitconvert (nxv16i1 PPR:$src))), (aarch64svcount PNR:$src)>; } // These allow casting from/to unpacked predicate types. 
@@ -3852,9 +3852,9 @@ defm STNT1D_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1d", 0b11, 0b1, ZZZZ_d_mul_r>; multiclass store_pn_x2 { def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), - (aarch64svcount PPR:$PNg), GPR64:$base), + (aarch64svcount PNR:$PNg), GPR64:$base), (RegImmInst (REG_SEQUENCE ZPR2Mul2, Ty:$vec0, zsub0, Ty:$vec1, zsub1), - PPR:$PNg, GPR64:$base, (i64 0))>; + PNR:$PNg, GPR64:$base, (i64 0))>; } // Stores of 2 consecutive vectors @@ -3878,10 +3878,10 @@ defm : store_pn_x2; multiclass store_pn_x4 { def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), (Ty ZPR:$vec2), (Ty ZPR:$vec3), - (aarch64svcount PPR:$PNg), GPR64:$base), + (aarch64svcount PNR:$PNg), GPR64:$base), (RegImmInst (REG_SEQUENCE ZPR4Mul4, Ty:$vec0, zsub0, Ty:$vec1, zsub1, Ty:$vec2, zsub2, Ty:$vec3, zsub3), - PPR:$PNg, GPR64:$base, (i64 0))>; + PNR:$PNg, GPR64:$base, (i64 0))>; } // Stores of 4 consecutive vectors @@ -3923,19 +3923,19 @@ defm WHILELS_CXX : sve2p1_int_while_rr_pn<"whilels", 0b111>; // Aliases for existing SVE instructions for which predicate-as-counter are // accepted as an operand to the instruction def : InstAlias<"ldr $Pt, [$Rn, $imm9, mul vl]", - (LDR_PXI PNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; + (LDR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; def : InstAlias<"ldr $Pt, [$Rn]", - (LDR_PXI PNRAny:$Pt, GPR64sp:$Rn, 0), 0>; + (LDR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, 0), 0>; def : InstAlias<"str $Pt, [$Rn, $imm9, mul vl]", - (STR_PXI PNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; + (STR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; def : InstAlias<"str $Pt, [$Rn]", - (STR_PXI PNRAny:$Pt, GPR64sp:$Rn, 0), 0>; + (STR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, 0), 0>; def : InstAlias<"mov $Pd, $Pn", - (ORR_PPzPP PNR8:$Pd, PNR8:$Pn, PNR8:$Pn, PNR8:$Pn), 0>; + (ORR_PPzPP PNRasPPR8:$Pd, PNRasPPR8:$Pn, PNRasPPR8:$Pn, PNRasPPR8:$Pn), 0>; -def : InstAlias<"pfalse\t$Pd", (PFALSE PNR8:$Pd), 0>; +def : InstAlias<"pfalse\t$Pd", (PFALSE PNRasPPR8:$Pd), 0>; } // End HasSVE2p1_or_HasSME2 diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f4b731db05b6..d9ff9fe23cd6 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1229,6 +1229,8 @@ public: case AArch64::PPRRegClassID: case AArch64::PPR_3bRegClassID: case AArch64::PPR_p8to15RegClassID: + case AArch64::PNRRegClassID: + case AArch64::PNR_p8to15RegClassID: RK = RegKind::SVEPredicateAsCounter; break; default: @@ -1249,6 +1251,9 @@ public: break; case AArch64::PPRRegClassID: case AArch64::PPR_3bRegClassID: + case AArch64::PPR_p8to15RegClassID: + case AArch64::PNRRegClassID: + case AArch64::PNR_p8to15RegClassID: RK = RegKind::SVEPredicateVector; break; default: @@ -1733,6 +1738,12 @@ public: Inst.addOperand(MCOperand::createReg(AArch64::Z0 + getReg() - Base)); } + void addPNRasPPRRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand( + MCOperand::createReg((getReg() - AArch64::PN0) + AArch64::P0)); + } + void addVectorReg64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); assert( @@ -2697,22 +2708,22 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) { static unsigned matchSVEPredicateAsCounterRegName(StringRef Name) { return StringSwitch(Name.lower()) - .Case("pn0", AArch64::P0) - .Case("pn1", AArch64::P1) - .Case("pn2", AArch64::P2) - .Case("pn3", AArch64::P3) - .Case("pn4", AArch64::P4) - .Case("pn5", 
AArch64::P5) - .Case("pn6", AArch64::P6) - .Case("pn7", AArch64::P7) - .Case("pn8", AArch64::P8) - .Case("pn9", AArch64::P9) - .Case("pn10", AArch64::P10) - .Case("pn11", AArch64::P11) - .Case("pn12", AArch64::P12) - .Case("pn13", AArch64::P13) - .Case("pn14", AArch64::P14) - .Case("pn15", AArch64::P15) + .Case("pn0", AArch64::PN0) + .Case("pn1", AArch64::PN1) + .Case("pn2", AArch64::PN2) + .Case("pn3", AArch64::PN3) + .Case("pn4", AArch64::PN4) + .Case("pn5", AArch64::PN5) + .Case("pn6", AArch64::PN6) + .Case("pn7", AArch64::PN7) + .Case("pn8", AArch64::PN8) + .Case("pn9", AArch64::PN9) + .Case("pn10", AArch64::PN10) + .Case("pn11", AArch64::PN11) + .Case("pn12", AArch64::PN12) + .Case("pn13", AArch64::PN13) + .Case("pn14", AArch64::PN14) + .Case("pn15", AArch64::PN15) .Default(0); } diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index e50ac5c92d50..df817f62f99f 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -140,11 +140,14 @@ DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); +static DecodeStatus DecodePNRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus -DecodePPR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, +DecodePNR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus DecodePPR2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -736,6 +739,18 @@ static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, return Success; } +static DecodeStatus DecodePNRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const MCDisassembler *Decoder) { + if (RegNo > 15) + return Fail; + + unsigned Register = + AArch64MCRegisterClasses[AArch64::PNRRegClassID].getRegister(RegNo); + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const MCDisassembler *Decoder) { @@ -747,13 +762,13 @@ static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodePPR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, +DecodePNR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const MCDisassembler *Decoder) { if (RegNo > 7) return Fail; // Just reuse the PPR decode table - return DecodePPRRegisterClass(Inst, RegNo + 8, Addr, Decoder); + return DecodePNRRegisterClass(Inst, RegNo + 8, Addr, Decoder); } static DecodeStatus DecodePPR2RegisterClass(MCInst &Inst, unsigned RegNo, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 2983e9a9be92..984ae06fb0b2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1335,9 +1335,10 @@ void AArch64InstPrinter::printPredicateAsCounter(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Reg = MI->getOperand(OpNum).getReg(); + if (Reg < AArch64::PN0 || Reg > AArch64::PN15) + 
llvm_unreachable("Unsupported predicate-as-counter register"); + O << "pn" << Reg - AArch64::PN0; - assert(Reg <= AArch64::P15 && "Unsupported predicate register"); - O << "pn" << (Reg - AArch64::P0); switch (EltSize) { case 0: break; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index fcaa57402bc2..3e12df0f84af 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -191,7 +191,6 @@ protected: template void printPredicateAsCounter(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - template void printComplexRotationOp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 2dbbab13e8f3..d8070e21908a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -192,6 +192,9 @@ public: uint32_t EncodePPR_p8to15(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint32_t EncodePNR_p8to15(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; uint32_t EncodeZPR2StridedRegisterClass(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, @@ -551,6 +554,14 @@ AArch64MCCodeEmitter::EncodePPR_p8to15(const MCInst &MI, unsigned OpIdx, return RegOpnd - AArch64::P8; } +uint32_t +AArch64MCCodeEmitter::EncodePNR_p8to15(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + auto RegOpnd = MI.getOperand(OpIdx).getReg(); + return RegOpnd - AArch64::PN8; +} + uint32_t AArch64MCCodeEmitter::EncodeZPR2StridedRegisterClass( const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index b135fec03a8c..edd24b4a849b 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -1333,17 +1333,17 @@ multiclass sve2_int_perm_sel_p { } def : InstAlias(NAME # _B) PNRAny:$Pd, - PNRAny:$Pn, PPR8:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm), 0>; + (!cast(NAME # _B) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR8:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm), 0>; def : InstAlias(NAME # _H) PNRAny:$Pd, - PNRAny:$Pn, PPR16:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_7:$imm), 0>; + (!cast(NAME # _H) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR16:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_7:$imm), 0>; def : InstAlias(NAME # _S) PNRAny:$Pd, - PNRAny:$Pn, PPR32:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_3:$imm), 0>; + (!cast(NAME # _S) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR32:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_3:$imm), 0>; def : InstAlias(NAME # _D) PNRAny:$Pd, - PNRAny:$Pn, PPR64:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_1:$imm), 0>; + (!cast(NAME # _D) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR64:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_1:$imm), 0>; def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm), MatrixIndexGPR32Op12_15:$idx)), diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll index 8cdb0696f7ac..d531c29da755 100644 --- 
a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll @@ -26,7 +26,7 @@ define void @asm_simple_register_clobber() { define i64 @asm_register_early_clobber() { ; CHECK-LABEL: name: asm_register_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"mov $0, 7; mov $1, 7", 1 /* sideeffect attdialect */, 2555915 /* regdef-ec:GPR64common */, def early-clobber %0, 2555915 /* regdef-ec:GPR64common */, def early-clobber %1, !0 + ; CHECK-NEXT: INLINEASM &"mov $0, 7; mov $1, 7", 1 /* sideeffect attdialect */, 2686987 /* regdef-ec:GPR64common */, def early-clobber %0, 2686987 /* regdef-ec:GPR64common */, def early-clobber %1, !0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]] @@ -54,7 +54,7 @@ entry: define i32 @test_single_register_output() nounwind ssp { ; CHECK-LABEL: name: test_single_register_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0 + ; CHECK-NEXT: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -66,7 +66,7 @@ entry: define i64 @test_single_register_output_s64() nounwind ssp { ; CHECK-LABEL: name: test_single_register_output_s64 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"mov $0, 7", 0 /* attdialect */, 2555914 /* regdef:GPR64common */, def %0 + ; CHECK-NEXT: INLINEASM &"mov $0, 7", 0 /* attdialect */, 2686986 /* regdef:GPR64common */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY %0 ; CHECK-NEXT: $x0 = COPY [[COPY]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 @@ -79,7 +79,7 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0, 1507338 /* regdef:GPR32common */, def %1 + ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0, 1638410 /* regdef:GPR32common */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] @@ -96,7 +96,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0, 2359306 /* regdef:FPR64 */, def %1 + ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0, 2490378 /* regdef:FPR64 */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %1 ; CHECK-NEXT: $d0 = COPY [[COPY1]](s64) @@ -120,13 +120,12 @@ entry: } define zeroext i8 @test_register_output_trunc(ptr %src) nounwind { - ; ; CHECK-LABEL: name: test_register_output_trunc ; CHECK: bb.1.entry: ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK-NEXT: INLINEASM &"mov ${0:w}, 32", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %1 + ; 
CHECK-NEXT: INLINEASM &"mov ${0:w}, 32", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8) @@ -156,7 +155,7 @@ define void @test_input_register_imm() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY [[C]](s64) - ; CHECK-NEXT: INLINEASM &"mov x0, $0", 1 /* sideeffect attdialect */, 2555913 /* reguse:GPR64common */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"mov x0, $0", 1 /* sideeffect attdialect */, 2686985 /* reguse:GPR64common */, [[COPY]] ; CHECK-NEXT: RET_ReallyLR call void asm sideeffect "mov x0, $0", "r"(i64 42) ret void @@ -191,7 +190,7 @@ define zeroext i8 @test_input_register(ptr %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]](p0) - ; CHECK-NEXT: INLINEASM &"ldtrb ${0:w}, [$1]", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %1, 2555913 /* reguse:GPR64common */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"ldtrb ${0:w}, [$1]", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %1, 2686985 /* reguse:GPR64common */, [[COPY1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8) @@ -208,7 +207,7 @@ define i32 @test_memory_constraint(ptr %a) nounwind { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK-NEXT: INLINEASM &"ldr $0, $1", 8 /* mayload attdialect */, 1507338 /* regdef:GPR32common */, def %1, 262158 /* mem:m */, [[COPY]](p0) + ; CHECK-NEXT: INLINEASM &"ldr $0, $1", 8 /* mayload attdialect */, 1638410 /* regdef:GPR32common */, def %1, 262158 /* mem:m */, [[COPY]](p0) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -222,7 +221,7 @@ define i16 @test_anyext_input() { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32common = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1507338 /* regdef:GPR32common */, def %0, 1507337 /* reguse:GPR32common */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1638410 /* regdef:GPR32common */, def %0, 1638409 /* reguse:GPR32common */, [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) @@ -238,7 +237,7 @@ define i16 @test_anyext_input_with_matching_constraint() { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32common = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1507338 /* regdef:GPR32common */, def %0, 2147483657 /* reguse tiedto:$0 */, [[COPY]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1638410 /* regdef:GPR32common */, def %0, 2147483657 /* reguse tiedto:$0 */, [[COPY]](tied-def 3) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll index bb7c9e274814..c224b0a259fc 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll @@ -71,7 +71,7 @@ define void @test2() #0 personality ptr @__gcc_personality_v0 { ; CHECK-NEXT: G_INVOKE_REGION_START ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY [[DEF]](p0) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2555913 /* reguse:GPR64common */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2686985 /* reguse:GPR64common */, [[COPY]] ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir index 28b56945c777..f75731e351c7 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir @@ -57,7 +57,7 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_reg_output - ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1310730 /* regdef:FPR32 */, def %0 + ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1310730 /* regdef:PPR2_with_psub_in_PNR_p8to15_and_PPR2_with_psub1_in_PPR_3b */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -75,7 +75,7 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_mixed_types - ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1310730 /* regdef:FPR32 */, def %0, 2162698 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %1 + ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1310730 /* regdef:PPR2_with_psub_in_PNR_p8to15_and_PPR2_with_psub1_in_PPR_3b */, def %0, 2162698 /* regdef:WSeqPairsClass_with_subo32_in_GPR32common */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr(s64) = COPY %1 ; CHECK-NEXT: $d0 = COPY [[COPY1]](s64) diff --git a/llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir b/llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir new file mode 100644 index 000000000000..5b1e24ea732f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir @@ -0,0 +1,44 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - %s -mtriple=aarch64 -verify-machineinstrs -run-pass=postrapseudos -mattr=+sme2 | FileCheck %s +--- +name: pnr_to_ppr +alignment: 4 +tracksRegLiveness: true +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: + hasRedZone: false +body: | + bb.0: + ; CHECK-LABEL: name: pnr_to_ppr + ; CHECK: renamable $pn8 = PTRUE_C_D + ; CHECK-NEXT: $p0 = ORR_PPzPP $p8, $p8, killed $p8, implicit-def $pn0 + ; CHECK-NEXT: RET_ReallyLR implicit killed $p0 + renamable $pn8 = PTRUE_C_D + $p0 = COPY killed renamable $pn8 + RET_ReallyLR implicit killed $p0 + +... 
+--- +name: ppr_to_pnr +alignment: 4 +tracksRegLiveness: true +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: + hasRedZone: false +body: | + bb.0: + ; CHECK-LABEL: name: ppr_to_pnr + ; CHECK: renamable $p8 = PTRUE_H 31 + ; CHECK-NEXT: $p0 = ORR_PPzPP $p8, $p8, killed $p8, implicit-def $pn0 + ; CHECK-NEXT: RET_ReallyLR implicit killed $pn0 + renamable $p8 = PTRUE_H 31 + $pn0 = COPY killed renamable $p8 + RET_ReallyLR implicit killed $pn0 + +... diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll index 2f1e2ad5e2fd..4ca2fb881579 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -79,3 +80,5 @@ define @test_svfadd_f16_Uph_constraint( %P %1 = tail call asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"( %Pg, %Zn, %Zm) ret %1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll b/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll index cf22216daf51..ed02fdfc996a 100644 --- a/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll +++ b/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll @@ -18,7 +18,7 @@ define i32 @test0() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"# $0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"# $0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %5 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -31,7 +31,7 @@ define i32 @test0() { ; CHECK-NEXT: bb.2.direct: ; CHECK-NEXT: successors: %bb.4(0x80000000), %bb.3(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"# $0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %7, 13 /* imm */, %bb.3 + ; CHECK-NEXT: INLINEASM_BR &"# $0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %7, 13 /* imm */, %bb.3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY %7 ; CHECK-NEXT: B %bb.4 ; CHECK-NEXT: {{ $}} @@ -107,7 +107,7 @@ define i32 @dont_split1() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %1, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %1, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %1 ; CHECK-NEXT: B %bb.1 ; CHECK-NEXT: {{ $}} @@ -168,7 +168,7 @@ define i32 @dont_split3() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0, 13 /* imm */, %bb.2 ; CHECK-NEXT: B %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.x: @@ -195,7 +195,7 @@ define i32 
@split_me0() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -245,7 +245,7 @@ define i32 @split_me1(i1 %z) { ; CHECK-NEXT: bb.1.w: ; CHECK-NEXT: successors: %bb.3(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY %5 ; CHECK-NEXT: B %bb.3 ; CHECK-NEXT: {{ $}} @@ -298,7 +298,7 @@ define i32 @split_me2(i1 %z) { ; CHECK-NEXT: bb.1.w: ; CHECK-NEXT: successors: %bb.3(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %6, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %6, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY %6 ; CHECK-NEXT: B %bb.3 ; CHECK-NEXT: {{ $}} @@ -341,7 +341,7 @@ define i32 @dont_split4() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.1 ; CHECK-NEXT: {{ $}} @@ -380,7 +380,7 @@ define i32 @dont_split5() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -411,7 +411,7 @@ define i32 @split_me3() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -458,7 +458,7 @@ define i32 @dont_split6(i32 %0) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32all = PHI [[COPY]], %bb.0, %2, %bb.2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32common = COPY [[PHI]] - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3), 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3), 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY %4 ; CHECK-NEXT: B %bb.3 
; CHECK-NEXT: {{ $}} @@ -493,7 +493,7 @@ define i32 @split_me4() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -524,7 +524,7 @@ define i32 @split_me5() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir index 6fe094cc6cbb..8255c7dd6f1e 100644 --- a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir +++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir @@ -91,10 +91,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_8_11 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY %2 ; CHECK-NEXT: [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_8_11 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) ; CHECK-NEXT: [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2 ; CHECK-NEXT: nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr ; CHECK-NEXT: Bcc 1, %bb.2, implicit $nzcv diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir index 1c2fe27cdbc3..546f2ec0c417 100644 --- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir +++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir @@ -26,7 +26,6 @@ ret void } - ; The optimization is not applicable when the source is not a virtual register define void @insert_vec_from_gpr(i32 %v, ptr %p) { entry: ret void @@ -488,7 +487,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr64all = IMPLICIT_DEF ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[DEF]] - ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 2359306 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]] + ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_8_11 */, def %1, 262158 /* mem:m */, killed 
[[COPY1]] ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/preserve.ll b/llvm/test/CodeGen/AArch64/preserve.ll index 924e6f8487eb..f95de60fbb24 100644 --- a/llvm/test/CodeGen/AArch64/preserve.ll +++ b/llvm/test/CodeGen/AArch64/preserve.ll @@ -4,13 +4,13 @@ target triple = "aarch64-unknown-unknown" declare void @bar1() define preserve_mostcc void @baz() #0 { -; CHECK: baz Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $b16 $b17 $b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 $q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi $z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 
$q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: baz Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $b16 $b17 $b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 $q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 
$z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 
$z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void } define preserve_allcc void @foo() #0 { -; CHECK: foo Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi $z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 
$q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: foo Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 
$d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 
$z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 951dbc72defc..08d55f07b7c4 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -7,12 +7,14 @@ target triple = "aarch64--linux-gnu" define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable } attributes #0 = { nounwind "target-features"="+sve" } + attributes #1 = { nounwind "target-features"="+sve2p1" } ... --- @@ -59,6 +61,49 @@ body: | RET_ReallyLR ... --- +name: spills_fills_stack_id_pnr +tracksRegLiveness: true +registers: + - { id: 0, class: pnr } +stack: +liveins: + - { reg: '$pn0', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $pn0 + + ; CHECK-LABEL: name: spills_fills_stack_id_pnr + ; CHECK: stack: + ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 + ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + + ; EXPAND-LABEL: name: spills_fills_stack_id_pnr + ; EXPAND: STR_PXI $p0, $sp, 7 + ; EXPAND: $p0 = LDR_PXI $sp, 7, implicit-def $pn0 + + %0:pnr = COPY $pn0 + + $pn0 = IMPLICIT_DEF + $pn1 = IMPLICIT_DEF + $pn2 = IMPLICIT_DEF + $pn3 = IMPLICIT_DEF + $pn4 = IMPLICIT_DEF + $pn5 = IMPLICIT_DEF + $pn6 = IMPLICIT_DEF + $pn7 = IMPLICIT_DEF + $pn8 = IMPLICIT_DEF + $pn9 = IMPLICIT_DEF + $pn10 = IMPLICIT_DEF + $pn11 = IMPLICIT_DEF + $pn12 = IMPLICIT_DEF + $pn13 = IMPLICIT_DEF + $pn14 = IMPLICIT_DEF + $pn15 = IMPLICIT_DEF + + $pn0 = COPY %0 + RET_ReallyLR +... +--- name: spills_fills_stack_id_zpr tracksRegLiveness: true registers: diff --git a/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s b/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s index 9c4173f50dee..99b38289ade7 100644 --- a/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s +++ b/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s @@ -1,7 +1,7 @@ // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve 2>&1 < %s| FileCheck %s cntp sp -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction // CHECK-NEXT: cntp sp // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: -- Gitee From 6bc0ff60da5a69ec24f8aab233719bc7aeb63033 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Fri, 29 Sep 2023 10:33:25 +0100 Subject: [PATCH 25/77] [AArch64][SME] Use PNR Reg classes for predicate constraint (#67606) This patch fixes an error where ASM with constraints cannot select SME instructions which use the top eight predicate-as-counter registers. 
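A minimal sketch of the construct this fixes (abridged from the new test added below; the function name here is hypothetical): with the "Uph" constraint and an SME2 predicate-as-counter (svcount) operand, the asm operand can now be allocated to one of pn8-pn15:

  define void @uph_sketch(target("aarch64.svcount") %pn) {
    ; "@3Uph" requests a predicate register in the upper half (pn8-pn15);
    ; before this patch, selection failed for svcount-typed asm operands.
    call void asm sideeffect "ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", "@3Uph"(target("aarch64.svcount") %pn)
    ret void
  }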
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 11 ++++--- llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7adb50c33d32..c4eda3a89252 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9949,18 +9949,21 @@ static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) { - if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) + if (VT != MVT::aarch64svcount && + (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)) return nullptr; switch (Constraint) { default: return nullptr; case PredicateConstraint::Uph: - return &AArch64::PPR_p8to15RegClass; + return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass + : &AArch64::PPR_p8to15RegClass; case PredicateConstraint::Upl: - return &AArch64::PPR_3bRegClass; + return VT == MVT::aarch64svcount ? nullptr : &AArch64::PPR_3bRegClass; case PredicateConstraint::Upa: - return &AArch64::PPRRegClass; + return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass + : &AArch64::PPRRegClass; } } diff --git a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll new file mode 100644 index 000000000000..98e7ad740681 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sme2 -stop-after=finalize-isel | FileCheck %s + +define dso_local void @UphPNR(target("aarch64.svcount") %predcnt) { +entry: +; CHECK: %0:ppr = COPY $p0 +; CHECK: STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store unknown-size into %ir.predcnt.addr, align 2) +; CHECK: %1:pnr_p8to15 = COPY %0 +; CHECK: INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, 393225 /* reguse:PNR_p8to15 */, %1 +; CHECK: RET_ReallyLR + %predcnt.addr = alloca target("aarch64.svcount"), align 2 + store target("aarch64.svcount") %predcnt, ptr %predcnt.addr, align 2 + %0 = load target("aarch64.svcount"), ptr %predcnt.addr, align 2 + call void asm sideeffect "ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", "@3Uph"(target("aarch64.svcount") %0) + ret void +} + +define dso_local void @UpaPNR(target("aarch64.svcount") %predcnt) { +entry: +; CHECK: %0:ppr = COPY $p0 +; CHECK: STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store unknown-size into %ir.predcnt.addr, align 2) +; CHECK: %1:pnr = COPY %0 +; CHECK: INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, 262153 /* reguse:PNR */, %1 +; CHECK: RET_ReallyLR + %predcnt.addr = alloca target("aarch64.svcount"), align 2 + store target("aarch64.svcount") %predcnt, ptr %predcnt.addr, align 2 + %0 = load target("aarch64.svcount"), ptr %predcnt.addr, align 2 + call void asm sideeffect "ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", "@3Upa"(target("aarch64.svcount") %0) + ret void +} \ No newline at end of file -- Gitee From c6c96ff6f4e581af6298b89e3fbf4bbfdb5554f4 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 6 Oct 2023 12:38:58 +0100 Subject: [PATCH 26/77] [AArch64][SME] Tile slices to lazy-save/restore should be RDSVL. (#68403) Instead of RDSVL * RDSVL. 
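Put differently (a sketch inferred from the test updates below; register choices follow the CHECK lines): ZA is an SVL_B x SVL_B byte array, so it has SVL_B horizontal slices of SVL_B bytes each, where SVL_B is the value returned by RDSVL #1. The lazy-save buffer itself still needs SVL_B * SVL_B bytes, but the num_za_save_slices field of the TPIDR2 block counts slices, not bytes:

  rdsvl x8, #1          // x8 = SVL_B, the number of ZA tile slices
  mov   x9, sp
  msub  x9, x8, x8, x9  // buffer allocation is still SVL_B * SVL_B bytes
  mov   sp, x9
  sturh w8, [x29, #-8]  // store SVL_B (not SVL_B * SVL_B) as the slice count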
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 7 +++---- .../CodeGen/AArch64/sme-disable-gisel-fisel.ll | 9 +++------ llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll | 14 +++++--------- .../CodeGen/AArch64/sme-shared-za-interface.ll | 6 ++---- 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c4eda3a89252..a8fae41c0a5f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7257,10 +7257,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue NumZaSaveSlices; if (!CalleeAttrs.preservesZA()) { // Set up a lazy save mechanism by storing the runtime live slices - // (worst-case SVL*SVL) to the TPIDR2 stack object. - SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - NumZaSaveSlices = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + // (worst-case SVL) to the TPIDR2 stack object. + NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); } else if (CalleeAttrs.preservesZA()) { NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64); } diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 7eaa0c3cf8b7..4ee2962b7b21 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -241,9 +241,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: subs x9, x9, x8 +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] @@ -281,8 +280,7 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: mul x8, x8, x8 -; CHECK-COMMON-NEXT: sub x9, x9, x8 +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: sub x10, x29, #16 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] @@ -342,8 +340,7 @@ define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nou ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: mul x8, x8, x8 -; CHECK-COMMON-NEXT: sub x9, x9, x8 +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sub x9, x29, #16 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 9576c975a0c4..9970f194936e 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -14,8 +14,7 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur x9, [x29, 
#-16] @@ -45,10 +44,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mul x19, x8, x8 +; CHECK-NEXT: rdsvl x19, #1 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: sub x8, x8, x19 +; CHECK-NEXT: msub x8, x19, x19, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: stur x8, [x29, #-16] @@ -92,8 +90,7 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_psta ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur x9, [x29, #-16] @@ -129,8 +126,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #80 ; CHECK-NEXT: stur x9, [x29, #-80] diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 47d465cc320b..3a18294d8208 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -12,8 +12,7 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur x9, [x29, #-16] @@ -44,8 +43,7 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur x9, [x29, #-16] -- Gitee From 6d80456b3dd2ae1af95ad9fb21b0cdba6a614b72 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 6 Oct 2023 12:36:49 +0000 Subject: [PATCH 27/77] [AArch64][SME] NFC: use update_test_checks.py for sme-pstate(sm|za)-attrs.ll Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Inline/AArch64/sme-pstatesm-attrs.ll | 234 ++++++++++++++---- .../Inline/AArch64/sme-pstateza-attrs.ll | 85 +++++-- 2 files changed, 247 insertions(+), 72 deletions(-) diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll index 4e7074ec75dd..3df5400875ae 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll @@ -1,45 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline | FileCheck %s declare void @inlined_body() "aarch64_pstate_sm_compatible"; -; ; Define some functions that will be called by the functions below. ; These just call a '...body()' function. If we see the call to one of ; these functions being replaced by '...body()', then we know it has been ; inlined. 
-; define void @normal_callee() { +; CHECK-LABEL: define void @normal_callee +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @inlined_body() ret void } define void @streaming_callee() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define void @streaming_callee +; CHECK-SAME: () #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @inlined_body() ret void } define void @locally_streaming_callee() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define void @locally_streaming_callee +; CHECK-SAME: () #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @inlined_body() ret void } define void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define void @streaming_compatible_callee +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @inlined_body() ret void } define void @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define void @streaming_compatible_locally_streaming_callee +; CHECK-SAME: () #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @inlined_body() ret void } -; ; Now test that inlining only happens when their streaming modes match. ; Test for a number of combinations, where: ; N Normal-interface (PSTATE.SM=0 on entry/exit) @@ -58,8 +86,12 @@ entry: ; [ ] N -> N + B ; [ ] N -> SC + B define void @normal_caller_normal_callee_inline() { -; CHECK-LABEL: @normal_caller_normal_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @normal_caller_normal_callee_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @normal_callee() ret void @@ -71,8 +103,12 @@ entry: ; [ ] N -> N + B ; [ ] N -> SC + B define void @normal_caller_streaming_callee_dont_inline() { -; CHECK-LABEL: @normal_caller_streaming_callee_dont_inline( -; CHECK: call void @streaming_callee() +; CHECK-LABEL: define void @normal_caller_streaming_callee_dont_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: ret void +; entry: call void @streaming_callee() ret void @@ -84,8 +120,12 @@ entry: ; [ ] N -> N + B ; [ ] N -> SC + B define void @normal_caller_streaming_compatible_callee_inline() { -; CHECK-LABEL: @normal_caller_streaming_compatible_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @normal_caller_streaming_compatible_callee_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_callee() ret void @@ -97,8 +137,12 @@ entry: ; [x] N -> N + B ; [ ] N -> SC + B define void @normal_caller_locally_streaming_callee_dont_inline() { -; CHECK-LABEL: @normal_caller_locally_streaming_callee_dont_inline( -; CHECK: call void @locally_streaming_callee() +; CHECK-LABEL: define void @normal_caller_locally_streaming_callee_dont_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @locally_streaming_callee() +; CHECK-NEXT: ret void +; entry: call void @locally_streaming_callee() ret 
void @@ -110,8 +154,12 @@ entry: ; [ ] N -> N + B ; [x] N -> SC + B define void @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() { -; CHECK-LABEL: @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline( -; CHECK: call void @streaming_compatible_locally_streaming_callee() +; CHECK-LABEL: define void @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_locally_streaming_callee() ret void @@ -123,8 +171,12 @@ entry: ; [ ] S -> N + B ; [ ] S -> SC + B define void @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: @streaming_caller_normal_callee_dont_inline( -; CHECK: call void @normal_callee() +; CHECK-LABEL: define void @streaming_caller_normal_callee_dont_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; entry: call void @normal_callee() ret void @@ -136,8 +188,12 @@ entry: ; [ ] S -> N + B ; [ ] S -> SC + B define void @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: @streaming_caller_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_caller_streaming_callee_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_callee() ret void @@ -149,8 +205,12 @@ entry: ; [ ] S -> N + B ; [ ] S -> SC + B define void @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: @streaming_caller_streaming_compatible_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_caller_streaming_compatible_callee_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_callee() ret void @@ -162,8 +222,12 @@ entry: ; [x] S -> N + B ; [ ] S -> SC + B define void @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: @streaming_caller_locally_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_caller_locally_streaming_callee_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @locally_streaming_callee() ret void @@ -175,8 +239,12 @@ entry: ; [ ] S -> N + B ; [x] S -> SC + B define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: @streaming_caller_streaming_compatible_locally_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_locally_streaming_callee() ret void @@ -188,8 +256,12 @@ entry: ; [ ] N + B -> N + B ; [ ] N + B -> SC + B define void @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: @locally_streaming_caller_normal_callee_dont_inline( -; CHECK: call void @normal_callee() +; CHECK-LABEL: define void 
@locally_streaming_caller_normal_callee_dont_inline +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; entry: call void @normal_callee() ret void @@ -201,8 +273,12 @@ entry: ; [ ] N + B -> N + B ; [ ] N + B -> SC + B define void @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: @locally_streaming_caller_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @locally_streaming_caller_streaming_callee_inline +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_callee() ret void @@ -214,8 +290,12 @@ entry: ; [ ] N + B -> N + B ; [ ] N + B -> SC + B define void @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_callee_inline +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_callee() ret void @@ -227,8 +307,12 @@ entry: ; [x] N + B -> N + B ; [ ] N + B -> SC + B define void @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: @locally_streaming_caller_locally_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @locally_streaming_caller_locally_streaming_callee_inline +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @locally_streaming_callee() ret void @@ -240,8 +324,12 @@ entry: ; [ ] N + B -> N + B ; [x] N + B -> SC + B define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_locally_streaming_callee() ret void @@ -253,8 +341,12 @@ entry: ; [ ] SC -> N + B ; [ ] SC -> SC + B define void @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: @streaming_compatible_caller_normal_callee_dont_inline( -; CHECK: call void @normal_callee() +; CHECK-LABEL: define void @streaming_compatible_caller_normal_callee_dont_inline +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; entry: call void @normal_callee() ret void @@ -266,8 +358,12 @@ entry: ; [ ] SC -> N + B ; [ ] SC -> SC + B define void @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: @streaming_compatible_caller_streaming_callee_dont_inline( -; CHECK: call void @streaming_callee() +; CHECK-LABEL: define void @streaming_compatible_caller_streaming_callee_dont_inline +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: ret void +; entry: call void @streaming_callee() ret void @@ -279,8 +375,12 @@ entry: ; [ ] SC -> 
N + B ; [ ] SC -> SC + B define void @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_callee_inline +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_callee() ret void @@ -292,8 +392,12 @@ entry: ; [x] SC -> N + B ; [ ] SC -> SC + B define void @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: @streaming_compatible_caller_locally_streaming_callee_dont_inline( -; CHECK: call void @locally_streaming_callee() +; CHECK-LABEL: define void @streaming_compatible_caller_locally_streaming_callee_dont_inline +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @locally_streaming_callee() +; CHECK-NEXT: ret void +; entry: call void @locally_streaming_callee() ret void @@ -305,8 +409,12 @@ entry: ; [ ] SC -> N + B ; [x] SC -> SC + B define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline( -; CHECK: call void @streaming_compatible_locally_streaming_callee() +; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_locally_streaming_callee() ret void @@ -317,8 +425,12 @@ entry: ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B define void @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline( -; CHECK: call void @normal_callee() +; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline +; CHECK-SAME: () #[[ATTR4]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; entry: call void @normal_callee() ret void @@ -330,8 +442,12 @@ entry: ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline +; CHECK-SAME: () #[[ATTR4]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_callee() ret void @@ -343,8 +459,12 @@ entry: ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline +; CHECK-SAME: () #[[ATTR4]] { +; 
CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_callee() ret void @@ -356,8 +476,12 @@ entry: ; [x] SC + B -> N + B ; [ ] SC + B -> SC + B define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline +; CHECK-SAME: () #[[ATTR4]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @locally_streaming_callee() ret void @@ -369,8 +493,12 @@ entry: ; [ ] SC + B -> N + B ; [x] SC + B -> SC + B define void @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_and_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_and_callee_inline +; CHECK-SAME: () #[[ATTR4]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @streaming_compatible_locally_streaming_callee() ret void diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll index bdc9e637fe90..a833e7a911ac 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll @@ -1,32 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline < %s | FileCheck %s declare void @inlined_body() -; ; Define some functions that will be called by the functions below. ; These just call a '...body()' function. If we see the call to one of ; these functions being replaced by '...body()', then we know it has been ; inlined. -; define void @nonza_callee() { +; CHECK-LABEL: define void @nonza_callee +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @inlined_body() ret void } define void @shared_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define void @shared_za_callee +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @inlined_body() ret void } define void @new_za_callee() "aarch64_pstate_za_new" { +; CHECK-LABEL: define void @new_za_callee +; CHECK-SAME: () #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; call void @inlined_body() ret void } -; ; Now test that inlining only happens when no lazy-save is needed. ; Test for a number of combinations, where: ; N Not using ZA. 
@@ -37,8 +52,12 @@ define void @new_za_callee() "aarch64_pstate_za_new" { ; [ ] N -> S (This combination is invalid) ; [ ] N -> Z define void @nonza_caller_nonza_callee_inline() { -; CHECK-LABEL: @nonza_caller_nonza_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @nonza_caller_nonza_callee_inline +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @nonza_callee() ret void @@ -48,8 +67,12 @@ entry: ; [ ] N -> S (This combination is invalid) ; [x] N -> Z define void @nonza_caller_new_za_callee_dont_inline() { -; CHECK-LABEL: @nonza_caller_new_za_callee_dont_inline( -; CHECK: call void @new_za_callee() +; CHECK-LABEL: define void @nonza_caller_new_za_callee_dont_inline +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; entry: call void @new_za_callee() ret void @@ -59,8 +82,12 @@ entry: ; [ ] Z -> S ; [ ] Z -> Z define void @new_za_caller_nonza_callee_dont_inline() "aarch64_pstate_za_new" { -; CHECK-LABEL: @new_za_caller_nonza_callee_dont_inline( -; CHECK: call void @nonza_callee() +; CHECK-LABEL: define void @new_za_caller_nonza_callee_dont_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @nonza_callee() +; CHECK-NEXT: ret void +; entry: call void @nonza_callee() ret void @@ -70,8 +97,12 @@ entry: ; [x] Z -> S ; [ ] Z -> Z define void @new_za_caller_shared_za_callee_inline() "aarch64_pstate_za_new" { -; CHECK-LABEL: @new_za_caller_shared_za_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void @new_za_caller_shared_za_callee_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_body() +; CHECK-NEXT: ret void +; entry: call void @shared_za_callee() ret void @@ -81,8 +112,12 @@ entry: ; [ ] Z -> S ; [x] Z -> Z define void @new_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_new" { -; CHECK-LABEL: @new_za_caller_new_za_callee_dont_inline( -; CHECK: call void @new_za_callee() +; CHECK-LABEL: define void @new_za_caller_new_za_callee_dont_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; entry: call void @new_za_callee() ret void @@ -92,8 +127,12 @@ entry: ; [ ] Z -> S ; [ ] Z -> Z define void @shared_za_caller_nonza_callee_dont_inline() "aarch64_pstate_za_shared" { -; CHECK-LABEL: @shared_za_caller_nonza_callee_dont_inline( -; CHECK: call void @nonza_callee() +; CHECK-LABEL: define void @shared_za_caller_nonza_callee_dont_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @nonza_callee() +; CHECK-NEXT: ret void +; entry: call void @nonza_callee() ret void @@ -103,8 +142,12 @@ entry: ; [x] S -> Z ; [ ] S -> S define void @shared_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_shared" { -; CHECK-LABEL: @shared_za_caller_new_za_callee_dont_inline( -; CHECK: call void @new_za_callee() +; CHECK-LABEL: define void @shared_za_caller_new_za_callee_dont_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; entry: call void @new_za_callee() ret void @@ -114,8 +157,12 @@ entry: ; [ ] S -> Z ; [x] S -> S define void @shared_za_caller_shared_za_callee_inline() "aarch64_pstate_za_shared" { -; CHECK-LABEL: @shared_za_caller_shared_za_callee_inline( -; CHECK: call void @inlined_body() +; CHECK-LABEL: define void 
@shared_za_caller_shared_za_callee_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @shared_za_callee()
   ret void

--
Gitee

From f587c18cfc0146dbf14b0b7026a2e97a303fd417 Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 6 Oct 2023 08:48:05 -0700
Subject: [PATCH 28/77] [AArch64][SME] Add remarks to flag lazy ZA saves, and SMSTART/SMSTOP transitions (#68255)

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 41 ++++++++-
 .../AArch64/sme-lazy-save-call-remarks.ll     | 32 +++++++
 .../sme-streaming-interface-remarks.ll        | 90 +++++++++++++++++++
 3 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a8fae41c0a5f..eef02d19fccb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -7252,6 +7253,19 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
     CalleeAttrs = SMEAttrs(ES->getSymbol());

+  auto DescribeCallsite =
+      [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
+    R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
+    if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
+      R << ore::NV("Callee", ES->getSymbol());
+    else if (CLI.CB && CLI.CB->getCalledFunction())
+      R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
+    else
+      R << "unknown callee";
+    R << "'";
+    return R;
+  };
+
   bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
   if (RequiresLazySave) {
     SDValue NumZaSaveSlices;
@@ -7277,13 +7291,38 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
         ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
         DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
         TPIDR2ObjAddr);
+    OptimizationRemarkEmitter ORE(&MF.getFunction());
+    ORE.emit([&]() {
+      auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
+                                                   CLI.CB)
+                      : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
+                                                   &MF.getFunction());
+      DescribeCallsite(R) << " sets up a lazy save for ZA";
+      if (CalleeAttrs.preservesZA())
+        R << ", but callee preserves ZA, so we request 0 slices to be saved";
+      else
+        R << ", and we request that all slices be saved";
+      R << ore::setExtraArgs()
+        << ore::NV("CalleePreservesZA", CalleeAttrs.preservesZA());
+      return R;
+    });
   }

   SDValue PStateSM;
   std::optional<bool> RequiresSMChange =
       CallerAttrs.requiresSMChange(CalleeAttrs);
-  if (RequiresSMChange)
+  if (RequiresSMChange) {
     PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64);
+    OptimizationRemarkEmitter ORE(&MF.getFunction());
+    ORE.emit([&]() {
+      auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
OptimizationRemarkAnalysis("sme", "SMETransition", + CLI.CB) + : OptimizationRemarkAnalysis("sme", "SMETransition", + &MF.getFunction()); + DescribeCallsite(R) << " requires a streaming mode transition"; + return R; + }); + } // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll new file mode 100644 index 000000000000..6762a768fd5b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64 -mattr=+sme --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s + +declare void @private_za_callee() +declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved" +declare float @llvm.cos.f32(float) + +define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved + call void @private_za_callee() + ret void +} + +define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { +; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved + call void @private_za_callee() +; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved + call void @private_za_callee() + ret void +} + +define void @test_lazy_save_preserved_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK: remark: :0:0: call from 'test_lazy_save_preserved_callee' to 'private_za_preserved_callee' sets up a lazy save for ZA, but callee preserves ZA, so we request 0 slices to be saved + call void @private_za_preserved_callee() + ret void +} + +define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" { +; CHECK: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA, and we request that all slices be saved + %res = call float @llvm.cos.f32(float %a) + ret float %res +} diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll new file mode 100644 index 000000000000..e1a474d89823 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll @@ -0,0 +1,90 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme,+sve -verify-machineinstrs --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s + +declare void @normal_callee() +declare void @streaming_callee() "aarch64_pstate_sm_enabled" +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + +; CHECK: remark: :0:0: call from 'normal_caller_streaming_callee' to 'streaming_callee' requires a streaming mode transition +define void @normal_caller_streaming_callee() nounwind { + call void @streaming_callee() + ret void; +} + +; CHECK: remark: :0:0: call from 'streaming_caller_normal_callee' to 'normal_callee' requires a streaming mode transition +define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" { + call void @normal_callee() + ret void; +} + +; CHECK-NOT: streaming_caller_streaming_callee +define void @streaming_caller_streaming_callee() nounwind 
"aarch64_pstate_sm_enabled" { + call void @streaming_callee() + ret void; +} + +; CHECK-NOT: streaming_caller_streaming_compatible_callee +define void @streaming_caller_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" { + call void @streaming_compatible_callee() + ret void; +} + +; CHECK: remark: :0:0: call from 'call_to_function_pointer_streaming_enabled' to 'unknown callee' requires a streaming mode transition +define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind { + call void %p() "aarch64_pstate_sm_enabled" + ret void +} + +; CHECK: remark: :0:0: call from 'smstart_clobber_simdfp' to 'streaming_callee' requires a streaming mode transition +define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind { + call void @streaming_callee() + ret <4 x i32> %x; +} + +; CHECK: remark: :0:0: call from 'smstart_clobber_sve' to 'streaming_callee' requires a streaming mode transition +define @smstart_clobber_sve( %x) nounwind { + call void @streaming_callee() + ret %x; +} + +; CHECK: remark: :0:0: call from 'smstart_clobber_sve_duplicate' to 'streaming_callee' requires a streaming mode transition +; CHECK: remark: :0:0: call from 'smstart_clobber_sve_duplicate' to 'streaming_callee' requires a streaming mode transition +define @smstart_clobber_sve_duplicate( %x) nounwind { + call void @streaming_callee() + call void @streaming_callee() + ret %x; +} + +; CHECK: remark: :0:0: call from 'call_to_intrinsic_without_chain' to 'cos' requires a streaming mode transition +define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_enabled" { +entry: + %res = call fast double @llvm.cos.f64(double %x) + %res.fadd = fadd fast double %res, %x + ret double %res.fadd +} + +declare double @llvm.cos.f64(double) + +; CHECK: remark: :0:0: call from 'disable_tailcallopt' to 'streaming_callee' requires a streaming mode transition +define void @disable_tailcallopt() nounwind { + tail call void @streaming_callee() + ret void; +} + +; CHECK: remark: :0:0: call from 'call_to_non_streaming_pass_sve_objects' to 'foo' requires a streaming mode transition +define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 { +entry: + %Data1 = alloca , align 16 + %Data2 = alloca , align 16 + %Data3 = alloca , align 16 + %0 = tail call i64 @llvm.aarch64.sme.cntsb() + call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0) + %1 = load , ptr %Data1, align 16 + %vecext = extractelement %1, i64 0 + ret i8 %vecext +} + +declare i64 @llvm.aarch64.sme.cntsb() + +declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef) + +attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" } -- Gitee From db66576e66a2fb6cca87364c6bd14c3626b7b184 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Fri, 6 Oct 2023 13:30:28 -0700 Subject: [PATCH 29/77] [AArch64][SME] Fix generating incorrect TBZ when lowering lazy save. (#68429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After calling arm_sme_state, the -S assembly would show clang generating a “tbz xN, #0, Lbb”. However, disassembling it showed that it was actually encoded as “tbz xN, #32, Lbb”. The issue is that for TBZ, if you want a bit offset <32 you need to use the W variant, since the instruction overloads the top bit of the immediate. 
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 6 ++-- ...compatible-to-normal-fn-wihout-sme-attr.ll | 4 +-- .../AArch64/sme-disable-gisel-fisel.ll | 4 +-- .../CodeGen/AArch64/sme-lazy-save-call.ll | 8 ++--- .../sme-streaming-compatible-interface.ll | 30 +++++++++---------- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 0b12f5515a1e..8e1590733615 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1000,10 +1000,12 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, // expected value for the callee (0 for a normal callee and 1 for a streaming // callee). auto PStateSM = MI.getOperand(2).getReg(); + auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + unsigned SMReg32 = TRI->getSubReg(PStateSM, AArch64::sub_32); bool IsStreamingCallee = MI.getOperand(3).getImm(); - unsigned Opc = IsStreamingCallee ? AArch64::TBZX : AArch64::TBNZX; + unsigned Opc = IsStreamingCallee ? AArch64::TBZW : AArch64::TBNZW; MachineInstrBuilder Tbx = - BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(PStateSM).addImm(0); + BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SMReg32).addImm(0); // Split MBB and create two new blocks: // - MBB now contains all instructions before MSRcond_pstatesvcrImm1. diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll index cffbadc53552..3fa1ee5b9b01 100644 --- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll +++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll @@ -17,12 +17,12 @@ define void @streaming_compatible() #0 { ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB0_2 +; CHECK-NEXT: tbz w19, #0, .LBB0_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: bl non_streaming -; CHECK-NEXT: tbz x19, #0, .LBB0_4 +; CHECK-NEXT: tbz w19, #0, .LBB0_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB0_4: diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 4ee2962b7b21..ba6097896622 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -401,14 +401,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: bl __arm_sme_state ; CHECK-COMMON-NEXT: and x19, x0, #0x1 -; CHECK-COMMON-NEXT: tbz x19, #0, .LBB12_2 +; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 ; CHECK-COMMON-NEXT: // %bb.1: ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: .LBB12_2: ; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf ; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill -; CHECK-COMMON-NEXT: tbz x19, #0, .LBB12_4 +; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_4 ; CHECK-COMMON-NEXT: // %bb.3: ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: .LBB12_4: diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll 
b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 9970f194936e..db6ebc144534 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -134,12 +134,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB3_2 +; CHECK-NEXT: tbz w19, #0, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: tbz x19, #0, .LBB3_4 +; CHECK-NEXT: tbz w19, #0, .LBB3_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB3_4: @@ -187,12 +187,12 @@ define void @za_shared_caller_za_preserved_callee() nounwind "aarch64_pstate_za_ ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB4_2 +; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: bl private_za_preserved_callee -; CHECK-NEXT: tbz x19, #0, .LBB4_4 +; CHECK-NEXT: tbz w19, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 214a5ce38f27..e247de284f6d 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -43,12 +43,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB1_2 +; CHECK-NEXT: tbz w19, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: bl normal_callee -; CHECK-NEXT: tbz x19, #0, .LBB1_4 +; CHECK-NEXT: tbz w19, #0, .LBB1_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB1_4: @@ -79,12 +79,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbnz x19, #0, .LBB2_2 +; CHECK-NEXT: tbnz w19, #0, .LBB2_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: bl streaming_callee -; CHECK-NEXT: tbnz x19, #0, .LBB2_4 +; CHECK-NEXT: tbnz w19, #0, .LBB2_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_4: @@ -134,7 +134,7 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB4_2 +; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB4_2: @@ -143,7 +143,7 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: bl normal_callee_vec_arg ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: tbz x19, #0, .LBB4_4 +; CHECK-NEXT: tbz w19, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: @@ -204,14 +204,14 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( 
Date: Mon, 9 Oct 2023 09:27:45 +0100 Subject: [PATCH 30/77] [AArch64][SME] Zero reserved bytes when allocating a new TPIDR2 object (#68411) SME support routines expect that the reserved bytes of TPIDR2 (bytes 10-15) are zero. This patch ensures that the reserved bytes are cleared when allocating a new TPIDR2 block. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +++++++++++ .../CodeGen/AArch64/sme-disable-gisel-fisel.ll | 12 ++++++++++-- llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll | 15 +++++++++++---- .../CodeGen/AArch64/sme-shared-za-interface.ll | 4 ++++ 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index eef02d19fccb..790a36929a0c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6272,6 +6272,17 @@ AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); + // Set the reserved bytes (10-15) to zero + EVT PtrTy = Ptr.getValueType(); + SDValue ReservedPtr = + DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy)); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr, + MPI); + ReservedPtr = + DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy)); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr, + MPI); + return TPIDR2Obj; } diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index ba6097896622..e9be9c785a19 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -210,6 +210,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -245,6 +247,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] ; CHECK-COMMON-NEXT: sub x8, x29, #16 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 @@ -283,6 +287,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 @@ -342,10 +348,12 @@ define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nou ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: 
stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sub x9, x29, #16 ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index db6ebc144534..5db2a1914ed1 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -17,6 +17,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 @@ -49,6 +51,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { ; CHECK-NEXT: msub x8, x19, x19, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: sturh w19, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x20 @@ -93,6 +97,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_psta ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 @@ -129,6 +135,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stur wzr, [x29, #-68] +; CHECK-NEXT: sturh wzr, [x29, #-70] ; CHECK-NEXT: stur x9, [x29, #-80] ; CHECK-NEXT: sturh w8, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x10 @@ -181,10 +189,9 @@ define void @za_shared_caller_za_preserved_callee() nounwind "aarch64_pstate_za_ ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-80] -; CHECK-NEXT: sub x8, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: sub x9, x29, #80 +; CHECK-NEXT: stp x8, xzr, [x29, #-80] +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 ; CHECK-NEXT: tbz w19, #0, .LBB4_2 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 3a18294d8208..c9714b9ad848 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -15,6 +15,8 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind { ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 @@ -46,6 +48,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 -- 
Gitee From 62961353f7ca59bf0bc7f979ce46767351aebc99 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 12 Oct 2023 13:00:11 +0000 Subject: [PATCH 31/77] Reland "[compiler-rt][aarch64] Add SME ABI support routines." (#68875) Resolved issue with green dragon build by fixing relocations for MachO/Darwin which doesn't compile without @page/@pageoff directives. Also silenced a warning about constructor(90) priority being < 101, which is reserved for the implementation. In this case, we're compiling the implementation so we should be able to use 90. This reverts commit 072713add4408199d4bce7b3b02cc74a4a382ee0. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- compiler-rt/cmake/Modules/AddCompilerRT.cmake | 4 + compiler-rt/cmake/builtin-config-ix.cmake | 6 + compiler-rt/lib/builtins/CMakeLists.txt | 3 + .../lib/builtins/aarch64/sme-abi-init.c | 52 +++++ compiler-rt/lib/builtins/aarch64/sme-abi.S | 197 ++++++++++++++++++ 5 files changed, 262 insertions(+) create mode 100644 compiler-rt/lib/builtins/aarch64/sme-abi-init.c create mode 100644 compiler-rt/lib/builtins/aarch64/sme-abi.S diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index a72e279dd75e..5ed49f0f5588 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -312,6 +312,10 @@ function(add_compiler_rt_runtime name type) set(COMPONENT_OPTION COMPONENT ${libname}) endif() + if(type STREQUAL "SHARED") + list(APPEND LIB_DEFS COMPILER_RT_SHARED_LIB) + endif() + if(type STREQUAL "OBJECT") if(CMAKE_C_COMPILER_ID MATCHES Clang AND CMAKE_C_COMPILER_TARGET) list(APPEND extra_cflags_${libname} "--target=${CMAKE_C_COMPILER_TARGET}") diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index 9cf4877baf48..e91e3923a756 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -33,6 +33,12 @@ asm(\".arch armv8-a+lse\"); asm(\"cas w0, w1, [x2]\"); ") +builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME +" +asm(\".arch armv9-a+sme\"); +asm(\"smstart\"); +") + if(ANDROID) set(OS_NAME "Android") else() diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index d62fa0432e2a..156640958a41 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -553,6 +553,8 @@ set(aarch64_SOURCES ${GENERIC_SOURCES} cpu_model.c aarch64/fp_mode.c + aarch64/sme-abi.S + aarch64/sme-abi-init.c ) # Generate outline atomics helpers from lse.S base @@ -778,6 +780,7 @@ else () endif() append_list_if(COMPILER_RT_HAS_ASM_LSE HAS_ASM_LSE BUILTIN_DEFS) + append_list_if(COMPILER_RT_HAS_ASM_SME HAS_ASM_SME BUILTIN_DEFS) foreach (arch ${BUILTIN_SUPPORTED_ARCH}) if (CAN_TARGET_${arch}) diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-init.c b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c new file mode 100644 index 000000000000..b6ee12170d56 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c @@ -0,0 +1,52 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +__attribute__((visibility("hidden"), nocommon)) +_Bool __aarch64_has_sme_and_tpidr2_el0; + +// We have multiple ways to check that the function has SME, depending on our +// target. 
+// * For Linux we can use __getauxval(). +// * For newlib we can use __aarch64_sme_accessible(). + +#if defined(__linux__) + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef HWCAP2_SME +#define HWCAP2_SME (1 << 23) +#endif + +extern unsigned long int __getauxval (unsigned long int); + +static _Bool has_sme(void) { + return __getauxval(AT_HWCAP2) & HWCAP2_SME; +} + +#else // defined(__linux__) + +#if defined(COMPILER_RT_SHARED_LIB) +__attribute__((weak)) +#endif +extern _Bool __aarch64_sme_accessible(void); + +static _Bool has_sme(void) { +#if defined(COMPILER_RT_SHARED_LIB) + if (!__aarch64_sme_accessible) + return 0; +#endif + return __aarch64_sme_accessible(); +} + +#endif // defined(__linux__) + +#if __GNUC__ >= 9 +#pragma GCC diagnostic ignored "-Wprio-ctor-dtor" +#endif +__attribute__((constructor(90))) +static void init_aarch64_has_sme(void) { + __aarch64_has_sme_and_tpidr2_el0 = has_sme(); +} diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S new file mode 100644 index 000000000000..207810b2e252 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -0,0 +1,197 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This patch implements the support routines for the SME ABI, +// described here: +// https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines + +#include "../assembly.h" + +#ifdef HAS_ASM_SME +#define ARCH armv9-a+sme +#define SMSTOP_SM smstop sm +#define SMSTOP_ZA smstop za +#define REG_TPIDR2_EL0 TPIDR2_EL0 +#define REG_SVCR SVCR +#define ADDSVL_X16_X16_1 addsvl x16, x16, #1 +#define LDR_ZA_W15_0_X16 ldr za[w15,0], [x16] +#define STR_ZA_W15_0_X16 str za[w15,0], [x16] +#define CNTD_X0 cntd x0 +#define CFI_OFFSET_VG_MINUS_16 .cfi_offset vg, -16 +#else +#define ARCH armv8-a +#define SMSTOP_SM .inst 0xd503427f +#define SMSTOP_ZA .inst 0xd503447f +#define REG_TPIDR2_EL0 S3_3_C13_C0_5 +#define REG_SVCR S3_3_C4_C2_2 +#define ADDSVL_X16_X16_1 .inst 0x04305830 +#define LDR_ZA_W15_0_X16 .inst 0xe1006200 +#define STR_ZA_W15_0_X16 .inst 0xe1206200 +#define CNTD_X0 .inst 0x04e0e3e0 +#define CFI_OFFSET_VG_MINUS_16 .cfi_escape 0x10, 0x2e, 0x03, 0x11, 0x70, 0x22 // $vg @ cfa - 16 +#endif + +#if !defined(__APPLE__) +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#else +// MachO requires @page/@pageoff directives because the global is defined +// in a different file. Otherwise this file may fail to build. +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@page +#define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff +#endif + +.arch ARCH + +// Utility function which calls a system's abort() routine. Because the function +// is streaming-compatible it should disable streaming-SVE mode before calling +// abort(). Note that there is no need to preserve any state before the call, +// because the function does not return. +DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) +.cfi_startproc + .variant_pcs SYMBOL_NAME(do_abort) + stp x29, x30, [sp, #-32]! 
+ CNTD_X0 + // Store VG to a stack location that we describe with .cfi_offset + str x0, [sp, #16] + .cfi_def_cfa_offset 32 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + CFI_OFFSET_VG_MINUS_16 + bl __arm_sme_state + tbz x0, #0, 2f +1: + SMSTOP_SM +2: + // We can't make this into a tail-call because the unwinder would + // need to restore the value of VG. + bl SYMBOL_NAME(abort) +.cfi_endproc +END_COMPILERRT_FUNCTION(do_abort) + +// __arm_sme_state fills the result registers based on a local +// that is set as part of the compiler-rt startup code. +// __aarch64_has_sme_and_tpidr2_el0 +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) + .variant_pcs __arm_sme_state + mov x0, xzr + mov x1, xzr + + adrp x16, TPIDR2_SYMBOL + ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET] + cbz w16, 1f +0: + orr x0, x0, #0xC000000000000000 + mrs x16, REG_SVCR + bfxil x0, x16, #0, #2 + mrs x1, REG_TPIDR2_EL0 +1: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) + .variant_pcs __arm_tpidr2_restore + // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific + // manner. + mrs x14, REG_TPIDR2_EL0 + cbnz x14, 2f + + // If any of the reserved bytes in the first 16 bytes of BLK are nonzero, + // the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x0, #10] + cbnz w14, 2f + ldr w14, [x0, #12] + cbnz w14, 2f + + // If BLK.za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x0] + cbz x16, 1f + + // If BLK.num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x0, #8] + cbz x14, 1f + + mov x15, xzr +0: + LDR_ZA_W15_0_X16 + ADDSVL_X16_X16_1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) + // If the current thread does not have access to TPIDR2_EL0, the subroutine + // does nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 1f + + // If TPIDR2_EL0 is null, the subroutine does nothing. + mrs x16, REG_TPIDR2_EL0 + cbz x16, 1f + + // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are + // nonzero, the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x16, #10] + cbnz w14, 2f + ldr w14, [x16, #12] + cbnz w14, 2f + + // If num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x16, #8] + cbz x14, 1f + + // If za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x16] + cbz x16, 1f + + mov x15, xzr +0: + STR_ZA_W15_0_X16 + ADDSVL_X16_X16_1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) + // If the current thread does not have access to SME, the subroutine does + // nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 0f + + // Otherwise, the subroutine behaves as if it did the following: + // * Call __arm_tpidr2_save. + stp x29, x30, [sp, #-16]! + .cfi_def_cfa_offset 16 + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + bl __arm_tpidr2_save + + // * Set TPIDR2_EL0 to null. + msr REG_TPIDR2_EL0, xzr + + // * Set PSTATE.ZA to 0. 
+ SMSTOP_ZA + + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 +0: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable) -- Gitee From de1745544b29f1193c0d36089cd1f242c71de210 Mon Sep 17 00:00:00 2001 From: CarolineConcatto <51754594+CarolineConcatto@users.noreply.github.com> Date: Fri, 13 Oct 2023 14:25:42 +0100 Subject: [PATCH 32/77] =?UTF-8?q?[AArch64][NFC]=20Refactor=20NEON,=20SVE?= =?UTF-8?q?=20and=20SME=20classes=20and=20multiclasses=20fo=E2=80=A6=20(#6?= =?UTF-8?q?8800)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …r the assembly disassembly This NFC patch refactors the assembly/disassembly class and multiclass in the AArch64 backend to receive a new 2023/09 AArch64[1] ISA release. The encoding for the 2023 instructions re-uses encoding blocks from previous assembly/disassembly instructions. The refactoring makes the class and multiclass for assembly/disassembly generic so it can be used to describe the instructions for the new ISA. [1]https://developer.arm.com/documentation/ddi0602/2023-09 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../lib/Target/AArch64/AArch64InstrFormats.td | 36 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 262 +++++++------- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 4 + .../MCTargetDesc/AArch64InstPrinter.cpp | 3 +- .../AArch64/MCTargetDesc/AArch64InstPrinter.h | 1 + llvm/lib/Target/AArch64/SMEInstrFormats.td | 333 ++++++++++-------- llvm/lib/Target/AArch64/SVEInstrFormats.td | 31 +- 9 files changed, 363 insertions(+), 317 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 39135df285c2..2d52b7c409b4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1485,7 +1485,7 @@ def UImm3s8Operand : UImmScaledMemoryIndexed<3, 8>; def uimm3s8 : Operand, ImmLeaf= 0 && Imm <= 56 && ((Imm % 8) == 0); }], UImmS8XForm> { - let PrintMethod = "printVectorIndex<8>"; + let PrintMethod = "printMatrixIndex<8>"; let ParserMatchClass = UImm3s8Operand; } @@ -5941,11 +5941,11 @@ multiclass SIMDLogicalThreeVectorTied size, // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract // bytes from S-sized elements. 
-class BaseSIMDThreeSameVectorDot sz, bits<4> opc, string asm, + string kind1, string kind2, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDThreeSameVectorTied { - def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64, + def v8i8 : BaseSIMDThreeSameVectorDot<0, U, 0b10, {0b001, Mixed}, asm, ".2s", ".8b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128, + def v16i8 : BaseSIMDThreeSameVectorDot<1, U, 0b10, {0b001, Mixed}, asm, ".4s", ".16b", V128, v4i32, v16i8, OpNode>; } @@ -8412,12 +8412,12 @@ class SIMDThreeSameVectorMatMul size, string asm, +class BaseSIMDThreeSameVectorIndexS size, bits<4> opc, string asm, string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied size, str multiclass SIMDThreeSameVectorDotIndex size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b", + def v8i8 : BaseSIMDThreeSameVectorIndexS<0, U, size, {0b111, Mixed}, asm, ".2s", ".8b", ".4b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b", + def v16i8 : BaseSIMDThreeSameVectorIndexS<1, U, size, {0b111, Mixed}, asm, ".4s", ".16b", ".4b", V128, v4i32, v16i8, OpNode>; } // ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed) let mayRaiseFPException = 1, Uses = [FPCR] in -class BaseSIMDThreeSameVectorFMLIndex opc, string asm, +class BaseSIMDThreeSameVectorIndexH sz, bits<4> opc, string asm, string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, - ValueType AccumType, ValueType InputType, - SDPatternOperator OpNode> : - BaseSIMDIndexedTied : + BaseSIMDIndexedTied opc, string asm, multiclass SIMDThreeSameVectorFMLIndex opc, string asm, SDPatternOperator OpNode> { - def v4f16 : BaseSIMDThreeSameVectorFMLIndex<0, U, opc, asm, ".2s", ".2h", ".h", - V64, v2f32, v4f16, OpNode>; - def v8f16 : BaseSIMDThreeSameVectorFMLIndex<1, U, opc, asm, ".4s", ".4h", ".h", - V128, v4f32, v8f16, OpNode>; + def v4f16 : BaseSIMDThreeSameVectorIndexH<0, U, 0b10, opc, asm, ".2s", ".2h", ".h", + V64, V128_lo, v2f32, v4f16, OpNode>; + def v8f16 : BaseSIMDThreeSameVectorIndexH<1, U, 0b10, opc, asm, ".4s", ".4h", ".h", + V128, V128_lo, v4f32, v8f16, OpNode>; } multiclass SIMDFPIndexed opc, string asm, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 27a7e26c5e18..8adceb9ffd15 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1198,7 +1198,7 @@ defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_ne class BaseSIMDSUDOTIndex - : BaseSIMDThreeSameVectorDotIndex { let Pattern = [(set (AccumType RegType:$dst), diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index f306021dd753..2685f2e3c810 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -66,8 +66,8 @@ let Predicates = [HasSME] in { defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b000, "bfmopa", int_aarch64_sme_mopa_wide>; defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b001, "bfmops", int_aarch64_sme_mops_wide>; -defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa", int_aarch64_sme_mopa>; -defm FMOPS_MPPZZ_S : 
sme_outer_product_fp32<0b1, "fmops", int_aarch64_sme_mops>; +defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, 0b00, ZPR32, "fmopa", int_aarch64_sme_mopa>; +defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, 0b00, ZPR32, "fmops", int_aarch64_sme_mops>; } let Predicates = [HasSMEF64F64] in { @@ -216,29 +216,29 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 tim let Predicates = [HasSME2] in { defm ADD_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"add", 0b0011010, MatrixOp32, ZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_add_write_single_za_vg1x2>; defm ADD_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"add", 0b0111010, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_add_write_single_za_vg1x4>; -defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b011010, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x2>; -defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b011010, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x4>; +defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b0110010, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x2>; +defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b0110010, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x4>; defm ADD_VG2_2ZZ : sme2_int_sve_destructive_vector_vg2_single<"add", 0b0110000>; defm ADD_VG4_4ZZ : sme2_int_sve_destructive_vector_vg4_single<"add", 0b0110000>; defm SUB_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"sub", 0b0011011, MatrixOp32, ZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_sub_write_single_za_vg1x2>; defm SUB_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"sub", 0b0111011, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_sub_write_single_za_vg1x4>; -defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b011011, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x2>; -defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b011011, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x4>; +defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b0110011, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x2>; +defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b0110011, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x4>; defm FMLA_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011000, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x2>; defm FMLA_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111000, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x4>; -defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b011000, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x2>; -defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b011000, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x4>; -defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b0000, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x2>; +defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0110000, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x2>; +defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0110000, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x4>; +defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b01, 0b0000, ZZ_s_mul_r, ZPR4b32, nxv4f32, 
int_aarch64_sme_fmla_lane_vg1x2>; defm FMLA_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmla", 0b0000, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x4>; defm FMLS_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011001, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x2>; defm FMLS_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111001, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x4>; -defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b011001, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x2>; -defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b011001, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x4>; -defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b0010, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x2>; +defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0110001, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x2>; +defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0110001, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x4>; +defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b01, 0b0010, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x2>; defm FMLS_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmls", 0b0010, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x4>; defm ADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"add", 0b0010, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_za32_vg1x2>; @@ -262,37 +262,37 @@ defm FMLAL_MZZI : sme2_mla_long_array_index<"fmlal", 0b10, 0b00, nxv8f16 defm FMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x2>; defm FMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x4>; defm FMLAL_MZZ : sme2_mla_long_array_single<"fmlal", 0b00, 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x1>; -defm FMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x2>; -defm FMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x4>; -defm FMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x2>; -defm FMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x4>; +defm FMLAL_VG2_M2ZZ_HtoS : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b000, MatrixOp32, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlal_single_vg2x2>; +defm FMLAL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b000, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlal_single_vg2x4>; +defm FMLAL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b000, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlal_vg2x2>; +defm FMLAL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlal_vg2x4>; defm FMLSL_MZZI : sme2_mla_long_array_index<"fmlsl", 0b10, 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x1>; defm FMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x2>; defm FMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x4>; defm FMLSL_MZZ : sme2_mla_long_array_single<"fmlsl", 0b00, 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x1>; -defm 
FMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x2>; -defm FMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x4>; -defm FMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x2>; -defm FMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x4>; +defm FMLSL_VG2_M2ZZ_HtoS : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b010, MatrixOp32, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x2>; +defm FMLSL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b010, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x4>; +defm FMLSL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b001, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlsl_vg2x2>; +defm FMLSL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b001, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlsl_vg2x4>; defm BFMLAL_MZZI : sme2_mla_long_array_index<"bfmlal", 0b10, 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x1>; defm BFMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x2>; defm BFMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x4>; defm BFMLAL_MZZ : sme2_mla_long_array_single<"bfmlal", 0b00, 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x1>; -defm BFMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x2>; -defm BFMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x4>; -defm BFMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x2>; -defm BFMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x4>; +defm BFMLAL_VG2_M2ZZ_HtoS : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b100, MatrixOp32, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x2>; +defm BFMLAL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b100, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x4>; +defm BFMLAL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlal_vg2x2>; +defm BFMLAL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlal_vg2x4>; defm BFMLSL_MZZI : sme2_mla_long_array_index<"bfmlsl", 0b10, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x1>; defm BFMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x2>; defm BFMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x4>; defm BFMLSL_MZZ : sme2_mla_long_array_single<"bfmlsl", 0b00, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x1>; -defm BFMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x2>; -defm BFMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x4>; -defm BFMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x2>; -defm BFMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x4>; +defm BFMLSL_VG2_M2ZZ_HtoS : 
sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b110, MatrixOp32, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x2>; +defm BFMLSL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b110, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x4>; +defm BFMLSL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b011, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlsl_vg2x2>; +defm BFMLSL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b011, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlsl_vg2x4>; defm SMLAL_MZZI : sme2_mla_long_array_index<"smlal", 0b11, 0b00, nxv8i16, int_aarch64_sme_smlal_lane_vg2x1>; defm SMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlal", 0b00, int_aarch64_sme_smlal_lane_vg2x2>; @@ -413,122 +413,122 @@ defm SCLAMP_VG4_4Z4Z : sme2_int_clamp_vector_vg4_multi<"sclamp", 0b0>; defm UCLAMP_VG2_2Z2Z : sme2_int_clamp_vector_vg2_multi<"uclamp", 0b1>; defm UCLAMP_VG4_4Z4Z : sme2_int_clamp_vector_vg4_multi<"uclamp", 0b1>; -defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b1001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_lane_za32_vg1x2>; +defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b01, 0b1001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_lane_za32_vg1x2>; defm FDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b1001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_lane_za32_vg1x4>; defm FDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"fdot", 0b0010000, MatrixOp32, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_single_za32_vg1x2>; defm FDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"fdot", 0b0110000, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_single_za32_vg1x4>; -defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b010000, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x2>; -defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b010000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x4>; +defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100000, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x2>; +defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x4>; -defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b1011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_lane_za32_vg1x2>; +defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b01, 0b1011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_lane_za32_vg1x2>; defm BFDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"bfdot", 0b1011, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_lane_za32_vg1x4>; defm BFDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"bfdot", 0b0010010, MatrixOp32, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_single_za32_vg1x2>; defm BFDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"bfdot", 0b0110010, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_single_za32_vg1x4>; -defm BFDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b010010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x2>; -defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b010010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x4>; +defm BFDOT_VG2_M2Z2Z_HtoS : 
sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b0100010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x2>; +defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b0100010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x4>; -defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b0011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; +defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b01, 0b0011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; -defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b0001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; +defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b01, 0b0001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; -defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1000, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za32_vg1x2>; -defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1100, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_lane_za32_vg1x2>; +defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b01, 0b1000, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za32_vg1x2>; +defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b01, 0b1100, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_lane_za32_vg1x2>; defm SDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1000, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za32_vg1x4>; defm SDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_lane_za32_vg1x4>; defm SDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"sdot", 0b1010101, MatrixOp32, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za32_vg1x2>; defm SDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"sdot", 0b1110101, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za32_vg1x4>; -defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110101, MatrixOp32, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x2>; -defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110101, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x4>; +defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b1101001, MatrixOp32, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x2>; +defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b1101001, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x4>; defm SDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg2_single<"sdot", 0b0010100, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_single_za32_vg1x2>; defm SDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg4_single<"sdot", 0b0110100, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_single_za32_vg1x4>; -defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b010100, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x2>; -defm SDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b010100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x4>; +defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b0101000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x2>; +defm SDOT_VG4_M4Z4Z_BtoS : 
sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b0101000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x4>; -defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b1111, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_lane_za32_vg1x2>; +defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b01, 0b1111, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_lane_za32_vg1x2>; defm SUDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sudot", 0b1111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_lane_za32_vg1x4>; defm SUDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg2_single<"sudot", 0b0010111, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_single_za32_vg1x2>; defm SUDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg4_single<"sudot", 0b0110111, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_single_za32_vg1x4>; -defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b0100, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_svdot_lane_za32_vg1x2>; +defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b01, 0b0100, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_svdot_lane_za32_vg1x2>; defm SVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"svdot", 0b0100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_svdot_lane_za32_vg1x4>; defm SUVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"suvdot", 0b0111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_suvdot_lane_za32_vg1x4>; -defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1010, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za32_vg1x2>; -defm UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1110, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_udot_lane_za32_vg1x2>; +defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b01, 0b1010, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za32_vg1x2>; +defm UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b01, 0b1110, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_udot_lane_za32_vg1x2>; defm UDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_udot_lane_za32_vg1x4>; defm UDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1010, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za32_vg1x4>; defm UDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"udot", 0b1010111, MatrixOp32, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_udot_single_za32_vg1x2>; defm UDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"udot", 0b1110111, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_udot_single_za32_vg1x4>; -defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110111, MatrixOp32, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za32_vg1x2>; -defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110111, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za32_vg1x4>; +defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b1101011, MatrixOp32, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za32_vg1x2>; +defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b1101011, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za32_vg1x4>; defm UDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg2_single<"udot", 0b0010110, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_udot_single_za32_vg1x2>; defm UDOT_VG4_M4ZZ_BtoS : 
sme2_dot_mla_add_sub_array_vg4_single<"udot", 0b0110110, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_udot_single_za32_vg1x4>; -defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b010110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x2>; -defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b010110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x4>; +defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b0101010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x2>; +defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b0101010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x4>; -defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b1101, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_lane_za32_vg1x2>; +defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b01, 0b1101, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_lane_za32_vg1x2>; defm USDOT_VG4_M4ZZI_BToS: sme2_multi_vec_array_vg4_index_32b<"usdot", 0b1101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_lane_za32_vg1x4>; defm USDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg2_single<"usdot", 0b0010101, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_single_za32_vg1x2>; defm USDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg4_single<"usdot", 0b0110101, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_single_za32_vg1x4>; -defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b010101, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x2>; -defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b010101, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x4>; +defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b0101001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x2>; +defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b0101001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x4>; defm USVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"usvdot", 0b0101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usvdot_lane_za32_vg1x4>; -defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b0110, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_uvdot_lane_za32_vg1x2>; +defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b01, 0b0110, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_uvdot_lane_za32_vg1x2>; defm UVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"uvdot", 0b0110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_uvdot_lane_za32_vg1x4>; -defm SMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlall", 0b000, int_aarch64_sme_smla_za32_lane_vg4x1>; -defm SMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlall", 0b000, int_aarch64_sme_smla_za32_lane_vg4x2>; -defm SMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlall", 0b000, int_aarch64_sme_smla_za32_lane_vg4x4>; -defm SMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"smlall", 0b0000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x1>; +defm SMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlall", 0b00, 0b000, int_aarch64_sme_smla_za32_lane_vg4x1>; +defm SMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlall", 0b00, 0b000, int_aarch64_sme_smla_za32_lane_vg4x2>; +defm SMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlall", 
0b00, 0b0000, int_aarch64_sme_smla_za32_lane_vg4x4>; +defm SMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"smlall", 0b00000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x1>; defm SMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"smlall", 0b00000, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x2>; defm SMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"smlall", 0b01000, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x4>; -defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b0000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x2>; -defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b0000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x4>; +defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b00000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x2>; +defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b00000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x4>; -defm USMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"usmlall", 0b001, int_aarch64_sme_usmla_za32_lane_vg4x1>; -defm USMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"usmlall", 0b100, int_aarch64_sme_usmla_za32_lane_vg4x2>; -defm USMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"usmlall", 0b100, int_aarch64_sme_usmla_za32_lane_vg4x4>; -defm USMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"usmlall", 0b0001, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x1>; +defm USMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"usmlall", 0b00, 0b001, int_aarch64_sme_usmla_za32_lane_vg4x1>; +defm USMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"usmlall", 0b00, 0b100, int_aarch64_sme_usmla_za32_lane_vg4x2>; +defm USMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"usmlall", 0b00, 0b0100, int_aarch64_sme_usmla_za32_lane_vg4x4>; +defm USMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"usmlall", 0b00001, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x1>; defm USMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"usmlall", 0b00001, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x2>; defm USMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"usmlall", 0b01001, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x4>; -defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b0001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x2>; -defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b0001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x4>; +defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b00001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x2>; +defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b00001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x4>; -defm SMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlsll", 0b010, int_aarch64_sme_smls_za32_lane_vg4x1>; -defm SMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlsll", 0b001, int_aarch64_sme_smls_za32_lane_vg4x2>; -defm SMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlsll", 0b001, int_aarch64_sme_smls_za32_lane_vg4x4>; -defm SMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"smlsll", 0b0010, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x1>; +defm SMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlsll", 
0b00, 0b010, int_aarch64_sme_smls_za32_lane_vg4x1>; +defm SMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlsll", 0b00, 0b001, int_aarch64_sme_smls_za32_lane_vg4x2>; +defm SMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlsll", 0b00, 0b0001, int_aarch64_sme_smls_za32_lane_vg4x4>; +defm SMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"smlsll", 0b00010, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x1>; defm SMLSLL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"smlsll", 0b00010, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x2>; defm SMLSLL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"smlsll", 0b01010, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x4>; -defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b0010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x2>; -defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b0010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x4>; +defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b00010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x2>; +defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b00010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x4>; -defm UMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlall", 0b100, int_aarch64_sme_umla_za32_lane_vg4x1>; -defm UMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlall", 0b010, int_aarch64_sme_umla_za32_lane_vg4x2>; -defm UMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlall", 0b010, int_aarch64_sme_umla_za32_lane_vg4x4>; -defm UMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"umlall", 0b0100, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x1>; +defm UMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlall", 0b00, 0b100, int_aarch64_sme_umla_za32_lane_vg4x1>; +defm UMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlall", 0b00, 0b010, int_aarch64_sme_umla_za32_lane_vg4x2>; +defm UMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlall", 0b00, 0b0010, int_aarch64_sme_umla_za32_lane_vg4x4>; +defm UMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"umlall", 0b00100, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x1>; defm UMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"umlall", 0b00100, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x2>; defm UMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"umlall", 0b01100, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x4>; -defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b0100, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x2>; -defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b0100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x4>; +defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b00100, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x2>; +defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b00100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x4>; -defm SUMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"sumlall", 0b101, int_aarch64_sme_sumla_za32_lane_vg4x1>; -defm SUMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"sumlall", 0b110, int_aarch64_sme_sumla_za32_lane_vg4x2>; -defm SUMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"sumlall", 0b110, 
int_aarch64_sme_sumla_za32_lane_vg4x4>; +defm SUMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"sumlall", 0b00, 0b101, int_aarch64_sme_sumla_za32_lane_vg4x1>; +defm SUMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"sumlall", 0b00, 0b110, int_aarch64_sme_sumla_za32_lane_vg4x2>; +defm SUMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"sumlall", 0b00, 0b0110, int_aarch64_sme_sumla_za32_lane_vg4x4>; defm SUMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"sumlall", 0b00101, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sumla_za32_single_vg4x2>; defm SUMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"sumlall", 0b01101, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sumla_za32_single_vg4x4>; -defm UMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlsll", 0b110, int_aarch64_sme_umls_za32_lane_vg4x1>; -defm UMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlsll", 0b011, int_aarch64_sme_umls_za32_lane_vg4x2>; -defm UMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlsll", 0b011, int_aarch64_sme_umls_za32_lane_vg4x4>; -defm UMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"umlsll", 0b0110, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x1>; +defm UMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlsll", 0b00, 0b110, int_aarch64_sme_umls_za32_lane_vg4x1>; +defm UMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlsll", 0b00, 0b011, int_aarch64_sme_umls_za32_lane_vg4x2>; +defm UMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlsll", 0b00, 0b0011, int_aarch64_sme_umls_za32_lane_vg4x4>; +defm UMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"umlsll", 0b00110, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x1>; defm UMLSLL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"umlsll", 0b00110, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x2>; defm UMLSLL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"umlsll", 0b01110, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x4>; -defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x2>; -defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x4>; +defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b00110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x2>; +defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b00110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x4>; defm BMOPA_MPPZZ_S : sme2_int_bmopx_tile<"bmopa", 0b100, int_aarch64_sme_bmopa_za32>; defm BMOPS_MPPZZ_S : sme2_int_bmopx_tile<"bmops", 0b101, int_aarch64_sme_bmops_za32>; @@ -674,13 +674,13 @@ defm STNT1D_4Z_STRIDED_IMM : sme2_st_vector_vg4_multi_scalar_immediate<0b11, 0b1 let Predicates = [HasSME2, HasSMEI16I64] in { defm ADD_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"add", 0b1011010, MatrixOp64, ZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_add_write_single_za_vg1x2>; defm ADD_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"add", 0b1111010, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_add_write_single_za_vg1x4>; -defm ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b111010, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x2>; -defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b111010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x4>; +defm 
ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b1110010, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x2>; +defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b1110010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x4>; defm SUB_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"sub", 0b1011011, MatrixOp64, ZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_sub_write_single_za_vg1x2>; defm SUB_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"sub", 0b1111011, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_sub_write_single_za_vg1x4>; -defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b111011, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x2>; -defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b111011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x4>; +defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b1110011, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x2>; +defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b1110011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x4>; defm ADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"add", 0b1010, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_za64_vg1x2>; defm ADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"add", 0b1010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_za64_vg1x4>; @@ -692,8 +692,8 @@ defm SDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"sdot", 0b01, ZZ_h defm SDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"sdot", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za64_vg1x4>; defm SDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg2_single<"sdot", 0b1010100, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za64_vg1x2>; defm SDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg4_single<"sdot", 0b1110100, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za64_vg1x4>; -defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x2>; -defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x4>; +defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b1101000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x2>; +defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b1101000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x4>; defm SVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"svdot", 0b101, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_svdot_lane_za64_vg1x4>; @@ -701,46 +701,46 @@ defm UDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"udot", 0b11, ZZ_h defm UDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"udot", 0b011, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za64_vg1x4>; defm UDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg2_single<"udot", 0b1010110, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_udot_single_za64_vg1x2>; defm UDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg4_single<"udot", 0b1110110, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_udot_single_za64_vg1x4>; -defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x2>; -defm 
UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x4>; +defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b1101010, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x2>; +defm UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b1101010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x4>; defm UVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"uvdot", 0b111, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_uvdot_lane_za64_vg1x4>; defm SMLALL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"smlall", 0b00, int_aarch64_sme_smla_za64_lane_vg4x1>; defm SMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlall", 0b00, int_aarch64_sme_smla_za64_lane_vg4x2>; defm SMLALL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"smlall", 0b00, int_aarch64_sme_smla_za64_lane_vg4x4>; -defm SMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"smlall", 0b1000, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x1>; +defm SMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"smlall", 0b10000, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x1>; defm SMLALL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"smlall", 0b10000, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x2>; defm SMLALL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"smlall", 0b11000, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x4>; -defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall", 0b1000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x2>; -defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall", 0b1000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x4>; +defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall", 0b10000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x2>; +defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall", 0b10000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x4>; defm SMLSLL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"smlsll", 0b01, int_aarch64_sme_smls_za64_lane_vg4x1>; defm SMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlsll", 0b01, int_aarch64_sme_smls_za64_lane_vg4x2>; defm SMLSLL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"smlsll", 0b01, int_aarch64_sme_smls_za64_lane_vg4x4>; -defm SMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"smlsll", 0b1010, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x1>; +defm SMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"smlsll", 0b10010, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x1>; defm SMLSLL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"smlsll", 0b10010, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x2>; defm SMLSLL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"smlsll", 0b11010, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x4>; -defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll", 0b1010, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x2>; -defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll", 0b1010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x4>; +defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll", 0b10010, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x2>; +defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll", 
0b10010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x4>; defm UMLALL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"umlall", 0b10, int_aarch64_sme_umla_za64_lane_vg4x1>; defm UMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlall", 0b10, int_aarch64_sme_umla_za64_lane_vg4x2>; defm UMLALL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"umlall", 0b10, int_aarch64_sme_umla_za64_lane_vg4x4>; -defm UMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"umlall", 0b1100, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x1>; +defm UMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"umlall", 0b10100, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x1>; defm UMLALL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"umlall", 0b10100, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x2>; defm UMLALL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"umlall", 0b11100, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x4>; -defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall", 0b1100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x2>; -defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall", 0b1100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x4>; +defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall", 0b10100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x2>; +defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall", 0b10100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x4>; defm UMLSLL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"umlsll", 0b11, int_aarch64_sme_umls_za64_lane_vg4x1>; defm UMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlsll", 0b11, int_aarch64_sme_umls_za64_lane_vg4x2>; defm UMLSLL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"umlsll", 0b11, int_aarch64_sme_umls_za64_lane_vg4x4>; -defm UMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"umlsll", 0b1110, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x1>; +defm UMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"umlsll", 0b10110, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x1>; defm UMLSLL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"umlsll", 0b10110, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x2>; defm UMLSLL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"umlsll", 0b11110, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x4>; -defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll", 0b1110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x2>; -defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll", 0b1110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x4>; +defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll", 0b10110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x2>; +defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll", 0b10110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x4>; } let Predicates = [HasSME2, HasSMEF64F64] in { @@ -748,15 +748,15 @@ defm FMLA_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmla", 0b00, ZZ_d_mu defm FMLA_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmla", 0b000, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_lane_vg1x4>; defm FMLA_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b1011000, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x2>; 
defm FMLA_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b1111000, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x4>; -defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b111000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x2>; -defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b111000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x4>; +defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b1110000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x2>; +defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b1110000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x4>; defm FMLS_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmls", 0b10, ZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x2>; defm FMLS_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmls", 0b010, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x4>; defm FMLS_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b1011001, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x2>; defm FMLS_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b1111001, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x4>; -defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b111001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x2>; -defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b111001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x4>; +defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b1110001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x2>; +defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b1110001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x4>; defm FADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fadd", 0b1000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_add_za64_vg1x2>; defm FADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fadd", 0b1000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_add_za64_vg1x4>; @@ -787,25 +787,25 @@ defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16 defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00>; -defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b00>; +defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>; +defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>; defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>; defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b010001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b010001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, 
ZZZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b01>; -defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b01>; +defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16>; +defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16>; defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>; defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b010011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b010011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>; defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>; -defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0>; -defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1>; +defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>; +defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>; } let Predicates = [HasSME2p1, HasB16B16] in { @@ -814,19 +814,19 @@ defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b10>; -defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b10>; +defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16>; +defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16>; defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>; defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b110001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b110001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b11>; -defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b11>; +defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16>; +defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16>; defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>; defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 
0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b110011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b110011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; defm BFMAX_VG2_2ZZ : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>; @@ -852,6 +852,6 @@ defm BFMINNM_VG4_4Z2Z : sme2p1_bf_max_min_vector_vg4_multi<"bfminnm", 0b0010011 defm BFCLAMP_VG2_2ZZZ: sme2p1_bfclamp_vector_vg2_multi<"bfclamp">; defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">; -defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0>; -defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1>; +defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, 0b11, ZPR16>; +defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, 0b11, ZPR16>; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 307535f6e4ae..741d228b3cad 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2204,8 +2204,8 @@ let Predicates = [HasSVEorSME] in { } // End HasSVEorSME let Predicates = [HasBF16, HasSVEorSME] in { - defm BFDOT_ZZZ : sve_float_dot<0b1, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>; - defm BFDOT_ZZI : sve_float_dot_indexed<0b1, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>; + defm BFDOT_ZZZ : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>; + defm BFDOT_ZZI : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>; } // End HasBF16, HasSVEorSME let Predicates = [HasBF16, HasSVE] in { @@ -3753,8 +3753,8 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_HasSME2] in { defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; -defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; def BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb">; def BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt">; def BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb">; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d9ff9fe23cd6..139ad21f6ad3 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4536,6 +4536,8 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) { // Check if register is followed by an index if (parseOptionalToken(AsmToken::LBrac)) { + Operands.push_back( + AArch64Operand::CreateToken("[", getLoc(), getContext())); const MCExpr *ImmVal; if (getParser().parseExpression(ImmVal)) return ParseStatus::NoMatch; @@ -4548,6 +4550,8 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) { 
Operands.push_back(AArch64Operand::CreateImm( MCConstantExpr::create(MCE->getValue(), getContext()), StartLoc, getLoc(), getContext())); + Operands.push_back( + AArch64Operand::CreateToken("]", getLoc(), getContext())); } return ParseStatus::Success; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 984ae06fb0b2..016e898bebf9 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1744,10 +1744,11 @@ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, O << "[" << Scale * MI->getOperand(OpNum).getImm() << "]"; } +template void AArch64InstPrinter::printMatrixIndex(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - O << MI->getOperand(OpNum).getImm(); + O << Scale * MI->getOperand(OpNum).getImm(); } void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index 3e12df0f84af..218ddfd918ec 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -172,6 +172,7 @@ protected: template void printVectorIndex(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + template void printMatrixIndex(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printAdrAdrpLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index edd24b4a849b..823115c7d025 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -230,7 +230,7 @@ def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; // SME Outer Products //===----------------------------------------------------------------------===// -class sme_fp_outer_product_inst sz, bit op, MatrixTileOperand za_ty, +class sme_fp_outer_product_inst sz, bits<2> op, MatrixTileOperand za_ty, ZPRRegOp zpr_ty, string mnemonic> : I<(outs za_ty:$ZAda), (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), @@ -242,7 +242,7 @@ class sme_fp_outer_product_inst sz, bit op, MatrixTileOperand za_ bits<3> Pn; bits<5> Zn; let Inst{31-25} = 0b1000000; - let Inst{24} = op; + let Inst{24} = op{1}; let Inst{23} = 0b1; let Inst{22-21} = sz; let Inst{20-16} = Zm; @@ -250,25 +250,25 @@ class sme_fp_outer_product_inst sz, bit op, MatrixTileOperand za_ let Inst{12-10} = Pn; let Inst{9-5} = Zn; let Inst{4} = S; - let Inst{3} = op; + let Inst{3} = op{0}; let Constraints = "$ZAda = $_ZAda"; } -multiclass sme_outer_product_fp32 { - def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { +multiclass sme_outer_product_fp32 sz, ZPRRegOp zpr_ty, string mnemonic, SDPatternOperator op> { + def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { bits<2> ZAda; let Inst{1-0} = ZAda; let Inst{2} = 0b0; } - def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; + def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } multiclass sme_outer_product_fp64 { - def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { + def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { bits<3> ZAda; let Inst{2-0} = ZAda; } @@ -278,8 +278,8 @@ multiclass sme_outer_product_fp64 def : 
SME_ZA_Tile_TwoPred_TwoVec_Pat; } -multiclass sme2p1_fmop_tile_fp16{ - def NAME : sme_fp_outer_product_inst { +multiclass sme2p1_fmop_tile_fp16 op, ZPRRegOp zpr_ty>{ + def NAME : sme_fp_outer_product_inst { bits<1> ZAda; let Inst{2-1} = 0b00; let Inst{0} = ZAda; @@ -1449,7 +1449,7 @@ multiclass sme2_dot_mla_add_sub_array_vg4_single op, //===----------------------------------------------------------------------===// // SME2 multiple vectors ternary INT/FP two and four registers -class sme2_dot_mla_add_sub_array_vg2_multi op, +class sme2_dot_mla_add_sub_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, string mnemonic> @@ -1463,20 +1463,19 @@ class sme2_dot_mla_add_sub_array_vg2_multi op, bits<2> Rv; bits<3> imm3; let Inst{31-23} = 0b110000011; - let Inst{22} = op{5}; //sz + let Inst{22} = op{6}; //sz let Inst{21} = 0b1; let Inst{20-17} = Zm; let Inst{16-15} = 0b00; let Inst{14-13} = Rv; - let Inst{12-10} = op{4-2}; + let Inst{12-10} = op{5-3}; let Inst{9-6} = Zn; - let Inst{5} = 0b0; - let Inst{4-3} = op{1-0}; + let Inst{5-3} = op{2-0}; let Inst{2-0} = imm3; let Constraints = "$ZAd = $_ZAd"; } -multiclass sme2_dot_mla_add_sub_array_vg2_multi op, +multiclass sme2_dot_mla_add_sub_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ValueType zpr_ty, SDPatternOperator intrinsic> { @@ -1490,7 +1489,7 @@ multiclass sme2_dot_mla_add_sub_array_vg2_multi op, (!cast(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } -class sme2_dot_mla_add_sub_array_vg4_multi op, +class sme2_dot_mla_add_sub_array_vg4_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, string mnemonic> @@ -1504,20 +1503,20 @@ class sme2_dot_mla_add_sub_array_vg4_multi op, bits<2> Rv; bits<3> imm3; let Inst{31-23} = 0b110000011; - let Inst{22} = op{5}; //sz + let Inst{22} = op{6}; //sz let Inst{21} = 0b1; let Inst{20-18} = Zm; let Inst{17-15} = 0b010; let Inst{14-13} = Rv; - let Inst{12-10} = op{4-2}; + let Inst{12-10} = op{5-3}; let Inst{9-7} = Zn; - let Inst{6-5} = 0b00; - let Inst{4-3} = op{1-0}; + let Inst{6} = 0b0; + let Inst{5-3} = op{2-0}; let Inst{2-0} = imm3; let Constraints = "$ZAd = $_ZAd"; } -multiclass sme2_dot_mla_add_sub_array_vg4_multi op, +multiclass sme2_dot_mla_add_sub_array_vg4_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ValueType zpr_ty, SDPatternOperator intrinsic>{ @@ -1794,8 +1793,8 @@ class sme2_mla_long_array_index_base op0, bits<2> op, Operand index_ty, } multiclass sme2_mla_long_array_index op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_index_base, SMEPseudo2Instr { + def _HtoS : sme2_mla_long_array_index_base, SMEPseudo2Instr { bits<3> i3; bits<5> Zn; bits<3> imm; @@ -1805,9 +1804,9 @@ multiclass sme2_mla_long_array_index op0, bits<2> op, V let Inst{2-0} = imm; } - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_Multi_Index_Pat; + def : SME2_ZA_TwoOp_Multi_Index_Pat; } class sme2_mla_long_array_vg2_index op0, bits<2> op> @@ -1825,14 +1824,14 @@ class sme2_mla_long_array_vg2_index op0, bits<2> op> } multiclass sme2_fp_mla_long_array_vg2_index op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg2_index, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg2_index, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : 
sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; } multiclass sme2_int_mla_long_array_vg2_index op, SDPatternOperator intrinsic> { @@ -1861,33 +1860,35 @@ class sme2_mla_long_array_vg4_index op0, bits<2> op> } multiclass sme2_fp_mla_long_array_vg4_index op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; } multiclass sme2_int_mla_long_array_vg4_index op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; } -class sme2_mla_long_arrayop0, bits<2> op, Operand index_ty, +class sme2_mla_long_arrayop0, bits<2> op, + MatrixOperand matrix_ty, + Operand index_ty, RegisterOperand first_vector_ty, RegisterOperand second_vector_ty, string mnemonic, string vg_acronym=""> - : I<(outs MatrixOp32:$ZAda), - (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, + : I<(outs matrix_ty:$ZAda), + (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm, first_vector_ty:$Zn, second_vector_ty:$Zm), mnemonic,"\t$ZAda[$Rv, $imm" # !if(!eq(vg_acronym, ""), "", ", " # vg_acronym) # "], $Zn, $Zm", "", []> , Sched<[]> { @@ -1905,8 +1906,8 @@ class sme2_mla_long_arrayop0, bits<2> op, Operand index_ty, } multiclass sme2_mla_long_array_single op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array , SMEPseudo2Instr{ + def _HtoS : sme2_mla_long_array , SMEPseudo2Instr{ bits<4> Zm; bits<5> Zn; bits<3> imm; @@ -1916,15 +1917,15 @@ multiclass sme2_mla_long_array_single op0, bits<2> op, let Inst{2-0} = imm; } - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_Multi_Single_Pat; + def : SME2_ZA_TwoOp_Multi_Single_Pat; } -class sme2_mla_long_array_vg24_single op0, bit vg4, bits<2> op, - RegisterOperand first_vector_ty, - string mnemonic, string vg_acronym> - : sme2_mla_long_array op0, bit vg4, bits<2> op, bit o2, + MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, + ZPRRegOp zpr_ty, string mnemonic, string vg_acronym> 
+ : sme2_mla_long_array { bits<4> Zm; bits<5> Zn; @@ -1932,96 +1933,117 @@ class sme2_mla_long_array_vg24_single op0, bit vg4, bits<2> op, let Inst{20} = vg4; let Inst{19-16} = Zm; let Inst{9-5} = Zn; - let Inst{2} = 0b0; + let Inst{2} = o2; let Inst{1-0} = imm; } -multiclass sme2_fp_mla_long_array_vg2_single op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b00, 0b0, op, ZZ_h, mnemonic, - "vgx2">, SMEPseudo2Instr; + +multiclass sme2_fp_mla_long_array_vg2_single op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, + ValueType zpr_ty, SDPatternOperator intrinsic> { + def NAME : sme2_mla_long_array_vg24_single<0b00, 0b0, op{2-1}, op{0}, matrix_ty, multi_vector_ty, + vector_ty, mnemonic, "vgx2">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg2_single op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b01, 0b0, op, ZZ_h, mnemonic, - "vgx2">, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg24_single<0b01, 0b0, op, 0b0, MatrixOp32, ZZ_h, ZPR4b16, mnemonic, + "vgx2">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>; } -multiclass sme2_fp_mla_long_array_vg4_single op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b00, 0b1, op, ZZZZ_h, mnemonic, - "vgx4">, SMEPseudo2Instr; +multiclass sme2_fp_mla_long_array_vg4_single op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, + ValueType zpr_ty, SDPatternOperator intrinsic> { + def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, op{2-1}, op{0}, matrix_ty, multi_vector_ty, + vector_ty, mnemonic, "vgx4">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg4_single op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b01, 0b1, op, ZZZZ_h, mnemonic, - "vgx4">, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg24_single<0b01, 0b1, op, 0b0, MatrixOp32, ZZZZ_h, ZPR4b16, mnemonic, + "vgx4">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, 
MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>; } -class sme2_mla_long_array_vg2_multi op0, bits<2> op> - : sme2_mla_long_array { + +class sme2_mla_long_array_vg2_multi op0, bits<3> op, + MatrixOperand matrix_ty, RegisterOperand multi_vector_ty> + : sme2_mla_long_array { bits<4> Zm; bits<4> Zn; bits<2> imm; let Inst{20-17} = Zm; let Inst{16} = 0b0; let Inst{9-6} = Zn; - let Inst{5} = 0b0; + let Inst{5} = op{2}; // fp8 let Inst{2} = 0b0; let Inst{1-0} = imm; } -multiclass sme2_fp_mla_long_array_vg2_multi op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg2_multi, SMEPseudo2Instr; +multiclass sme2_fp_mla_long_array_vg2_multi op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, + ValueType zpr_ty, SDPatternOperator intrinsic> { + + def NAME : sme2_mla_long_array_vg2_multi, + SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg2_multi op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg2_multi, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg2_multi, + SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>; } -class sme2_mla_long_array_vg4_multi op0, bits<2> op> - : sme2_mla_long_array { +class sme2_mla_long_array_vg4_multi op0, bits<3> op, + MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty> + : sme2_mla_long_array { bits<3> Zm; bits<3> Zn; bits<2> imm; @@ -2029,31 +2051,37 @@ class sme2_mla_long_array_vg4_multi op0, bits<2> op> let Inst{17} = 0b0; let Inst{16} = 0b1; let Inst{9-7} = Zn; - let Inst{6-5} = 0b00; + let Inst{6} = 0b0; + let Inst{5} = op{2}; //fp8 let Inst{2} = 0b0; let Inst{1-0} = imm; } -multiclass sme2_fp_mla_long_array_vg4_multi op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg4_multi, SMEPseudo2Instr; +multiclass sme2_fp_mla_long_array_vg4_multi op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, ValueType zpr_ty, + SDPatternOperator intrinsic> { + def NAME : sme2_mla_long_array_vg4_multi, + SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg4_multi op, SDPatternOperator intrinsic> { - def _S : 
sme2_mla_long_array_vg4_multi, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg4_multi, + SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>; } //===----------------------------------------------------------------------===// @@ -2344,7 +2372,7 @@ multiclass sme2_zip_vector_vg2 { //===----------------------------------------------------------------------===// // SME2 Dot Products and MLA -class sme2_multi_vec_array_vg2_index op, MatrixOperand matrix_ty, +class sme2_multi_vec_array_vg2_index sz, bits<6> op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, Operand index_ty, string mnemonic> @@ -2357,8 +2385,8 @@ class sme2_multi_vec_array_vg2_index op, MatrixOperand matrix_ty bits<2> Rv; bits<4> Zn; bits<3> imm3; - let Inst{31-23} = 0b110000010; - let Inst{22} = sz; + let Inst{31-24} = 0b11000001; + let Inst{23-22} = sz; let Inst{21-20} = 0b01; let Inst{19-16} = Zm; let Inst{15} = 0b0; @@ -2372,11 +2400,11 @@ class sme2_multi_vec_array_vg2_index op, MatrixOperand matrix_ty } // SME2 multi-vec ternary indexed two registers 32-bit -multiclass sme2_multi_vec_array_vg2_index_32b op, +multiclass sme2_multi_vec_array_vg2_index_32b sz, bits<4> op, RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, ValueType vt, SDPatternOperator intrinsic> { - def NAME : sme2_multi_vec_array_vg2_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty, vector_ty, + def NAME : sme2_multi_vec_array_vg2_index, SMEPseudo2Instr { bits<2> i; let Inst{11-10} = i; @@ -2392,9 +2420,10 @@ multiclass sme2_multi_vec_array_vg2_index_32b op, } // SME2.1 multi-vec ternary indexed two registers 16-bit -multiclass sme2p1_multi_vec_array_vg2_index_16b op> { - def NAME : sme2_multi_vec_array_vg2_index<0b0, {0b1,?,?,op,?}, MatrixOp16, - ZZ_h_mul_r, ZPR4b16, +multiclass sme2p1_multi_vec_array_vg2_index_16b sz, bits<3> op, + RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> { + def NAME : sme2_multi_vec_array_vg2_index { bits<3> i; let Inst{11-10} = i{2-1}; @@ -2402,7 +2431,7 @@ multiclass sme2p1_multi_vec_array_vg2_index_16b op> { } def : InstAlias(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i), 0>; + multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>; } // SME2 multi-vec ternary indexed two registers 64-bit @@ -2451,7 +2480,7 @@ multiclass sme2_multi_vec_array_vg2_index_64b op, multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), 0>; } -class sme2_multi_vec_array_vg4_index op, MatrixOperand matrix_ty, +class sme2_multi_vec_array_vg4_index op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, Operand index_ty, string mnemonic> @@ -2470,10 +2499,9 @@ class sme2_multi_vec_array_vg4_index op, MatrixOperand matrix_ty let Inst{19-16} = Zm; let Inst{15} = 0b1; let Inst{14-13} = Rv; - let Inst{12-10} = op{5-3}; + let Inst{12-10} = op{6-4}; let Inst{9-7} = Zn; - let Inst{6} = 0b0; - let Inst{5-3} = op{2-0}; + let Inst{6-3} = op{3-0}; let Inst{2-0} = imm3; let Constraints = "$ZAda = $_ZAda"; @@ -2484,7 +2512,7 @@ multiclass sme2_multi_vec_array_vg4_index_32b op, 
RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, ValueType vt, SDPatternOperator intrinsic> { - def NAME : sme2_multi_vec_array_vg4_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty, + def NAME : sme2_multi_vec_array_vg4_index<0b1, {op{3},?,?,0b0, op{2-0}}, MatrixOp32, multi_vector_ty, vector_ty, VectorIndexS32b_timm, mnemonic>, SMEPseudo2Instr { bits<2> i; let Inst{11-10} = i; @@ -2500,9 +2528,11 @@ multiclass sme2_multi_vec_array_vg4_index_32b op, } // SME2.1 multi-vec ternary indexed four registers 16-bit -multiclass sme2p1_multi_vec_array_vg4_index_16b op> { +multiclass sme2p1_multi_vec_array_vg4_index_16b op, + RegisterOperand multi_vector_ty, + ZPRRegOp zpr_ty> { def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16, - ZZZZ_h_mul_r, ZPR4b16, + multi_vector_ty, zpr_ty, VectorIndexH, mnemonic>{ bits<3> i; let Inst{11-10} = i{2-1}; @@ -2511,7 +2541,7 @@ multiclass sme2p1_multi_vec_array_vg4_index_16b op> { def : InstAlias(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, - sme_elm_idx0_7:$imm3, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i), 0>; + sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>; } // SME2 multi-vec ternary indexed four registers 64-bit @@ -2561,7 +2591,7 @@ multiclass sme2_multi_vec_array_vg4_index_64b op, } //===----------------------------------------------------------------------===// // SME2 multi-vec indexed long long MLA one source 32-bit -class sme2_mla_ll_array_index_32b op> +class sme2_mla_ll_array_index_32b sz, bits<3> op> : I<(outs MatrixOp32:$ZAda), (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s4range:$imm2, ZPR8:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), mnemonic, "\t$ZAda[$Rv, $imm2], $Zn, $Zm$i", @@ -2571,7 +2601,9 @@ class sme2_mla_ll_array_index_32b op> bits<4> i; bits<5> Zn; bits<2> imm2; - let Inst{31-20} = 0b110000010000; + let Inst{31-24} = 0b11000001; + let Inst{23-22} = sz; + let Inst{21-20} = 0b00; let Inst{19-16} = Zm; let Inst{15} = i{3}; let Inst{14-13} = Rv; @@ -2583,8 +2615,8 @@ class sme2_mla_ll_array_index_32b op> let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_index_32b op, SDPatternOperator intrinsic> { - def NAME : sme2_mla_ll_array_index_32b, SMEPseudo2Instr; +multiclass sme2_mla_ll_array_index_32b sz, bits<3> op, SDPatternOperator intrinsic> { + def NAME : sme2_mla_ll_array_index_32b, SMEPseudo2Instr; def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; @@ -2625,7 +2657,7 @@ multiclass sme2_mla_ll_array_index_64b op, SDPatternOpe def : SME2_ZA_TwoOp_Multi_Index_Pat; } -class sme2_mla_ll_array_vg24_index_32b op, +class sme2_mla_ll_array_vg24_index_32b sz, bit vg4, bits<3> op, RegisterOperand vector_ty, string mnemonic> : I<(outs MatrixOp32:$ZAda), @@ -2637,7 +2669,9 @@ class sme2_mla_ll_array_vg24_index_32b op, bits<2> Rv; bits<4> i; bit imm; - let Inst{31-20} = 0b110000010001; + let Inst{31-24} = 0b11000001; + let Inst{23-22} = sz; + let Inst{21-20} = 0b01; let Inst{19-16} = Zm; let Inst{15} = vg4; let Inst{14-13} = Rv; @@ -2652,8 +2686,8 @@ class sme2_mla_ll_array_vg24_index_32b op, //SME2 multi-vec indexed long long MLA two sources 32-bit -multiclass sme2_mla_ll_array_vg2_index_32b op, SDPatternOperator intrinsic> { - def NAME: sme2_mla_ll_array_vg24_index_32b<0b0, op, ZZ_b_mul_r, mnemonic>, SMEPseudo2Instr { +multiclass sme2_mla_ll_array_vg2_index_32b sz, bits<3> op, SDPatternOperator intrinsic> { + def NAME: sme2_mla_ll_array_vg24_index_32b, SMEPseudo2Instr { bits<4> Zn; let Inst{9-6} = Zn; } @@ -2668,11 +2702,11 @@ multiclass 
sme2_mla_ll_array_vg2_index_32b op, SDPatter // SME2 multi-vec indexed long long MLA four sources 32-bit -multiclass sme2_mla_ll_array_vg4_index_32b op, SDPatternOperator intrinsic> { - def NAME: sme2_mla_ll_array_vg24_index_32b<0b1, op, ZZZZ_b_mul_r, mnemonic>, SMEPseudo2Instr { +multiclass sme2_mla_ll_array_vg4_index_32b sz, bits<4> op, SDPatternOperator intrinsic> { + def NAME: sme2_mla_ll_array_vg24_index_32b, SMEPseudo2Instr { bits<3> Zn; let Inst{9-7} = Zn; - let Inst{6} = 0b0; + let Inst{6} = op{3}; } def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; @@ -2744,7 +2778,7 @@ multiclass sme2_mla_ll_array_vg4_index_64b op, SDPatter //SME2 multiple and single vector long long FMA one source -class sme2_mla_ll_array_single op, +class sme2_mla_ll_array_single op, MatrixOperand matrix_ty, ZPRRegOp vector_ty, ZPRRegOp zpr_ty> : I<(outs matrix_ty:$ZAda), @@ -2757,8 +2791,9 @@ class sme2_mla_ll_array_single op, bits<5> Zn; bits<2> imm; let Inst{31-23} = 0b110000010; - let Inst{22} = op{3}; //sz - let Inst{21-20} = 0b10; + let Inst{22} = op{4}; //sz + let Inst{21} = 0b1; + let Inst{20} = op{3}; //fp8 let Inst{19-16} = Zm; let Inst{15} = 0b0; let Inst{14-13} = Rv; @@ -2770,7 +2805,7 @@ class sme2_mla_ll_array_single op, let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_single op, +multiclass sme2_mla_ll_array_single op, MatrixOperand matrix_ty, ZPRRegOp vector_ty, ZPRRegOp zpr_ty, ValueType vt, SDPatternOperator intrinsic> { def NAME : sme2_mla_ll_array_single, SMEPseudo2Instr; @@ -2780,29 +2815,28 @@ multiclass sme2_mla_ll_array_single op, def : SME2_ZA_TwoOp_Multi_Single_Pat; } -class sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, +class sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, RegisterOperand vector_ty, ZPRRegOp zpr_ty, string mnemonic> : I<(outs matrix_ty:$ZAda), (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, vector_ty:$Zn, zpr_ty:$Zm), - mnemonic, "\t$ZAda[$Rv, $imm, " # !if(op{3}, "vgx4", "vgx2") # "], $Zn, $Zm", + mnemonic, "\t$ZAda[$Rv, $imm, " # !if(op{4}, "vgx4", "vgx2") # "], $Zn, $Zm", "", []>, Sched<[]> { bits<4> Zm; bits<2> Rv; bits<5> Zn; bit imm; let Inst{31-23} = 0b110000010; - let Inst{22} = op{4}; //sz + let Inst{22} = op{5}; //sz let Inst{21} = 0b1; - let Inst{20} = op{3}; //vg4 + let Inst{20} = op{4}; //vg4 let Inst{19-16} = Zm; let Inst{15} = 0b0; let Inst{14-13} = Rv; let Inst{12-10} = 0b000; let Inst{9-5} = Zn; - let Inst{4-2} = op{2-0}; - let Inst{1} = 0b0; + let Inst{4-1} = op{3-0}; let Inst{0} = imm; let Constraints = "$ZAda = $_ZAda"; @@ -2810,7 +2844,7 @@ class sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, //SME2 single-multi long long MLA two and four sources -multiclass sme2_mla_ll_array_vg24_single op, +multiclass sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> { @@ -2828,7 +2862,7 @@ multiclass sme2_mla_ll_array_vg2_single op, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty, ValueType vt, SDPatternOperator intrinsic> { - defm NAME: sme2_mla_ll_array_vg24_single; + defm NAME: sme2_mla_ll_array_vg24_single; def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; } @@ -2837,14 +2871,14 @@ multiclass sme2_mla_ll_array_vg4_single op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty, ValueType vt, SDPatternOperator intrinsic> { - defm NAME: sme2_mla_ll_array_vg24_single; + defm NAME: sme2_mla_ll_array_vg24_single; def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; } // SME2 multiple vectors long long MLA two sources -class 
sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, +class sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand vector_ty,string mnemonic> : I<(outs matrix_ty:$ZAda), (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, @@ -2856,22 +2890,21 @@ class sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, bits<4> Zn; bit imm; let Inst{31-23} = 0b110000011; - let Inst{22} = op{3}; // sz + let Inst{22} = op{4}; // sz let Inst{21} = 0b1; let Inst{20-17} = Zm; let Inst{16-15} = 0b00; let Inst{14-13} = Rv; let Inst{12-10} = 0b000; let Inst{9-6} = Zn; - let Inst{5} = 0b0; - let Inst{4-2} = op{2-0}; + let Inst{5-2} = op{3-0}; let Inst{1} = 0b0; let Inst{0} = imm; let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_vg2_multi op, +multiclass sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand vector_ty, ValueType vt, SDPatternOperator intrinsic> { @@ -2887,7 +2920,7 @@ multiclass sme2_mla_ll_array_vg2_multi op, // SME2 multiple vectors long long MLA four sources -class sme2_mla_ll_array_vg4_multi op,MatrixOperand matrix_ty, +class sme2_mla_ll_array_vg4_multi op,MatrixOperand matrix_ty, RegisterOperand vector_ty, string mnemonic> : I<(outs matrix_ty:$ZAda), @@ -2900,22 +2933,22 @@ class sme2_mla_ll_array_vg4_multi op,MatrixOperand matrix_ty, bits<3> Zn; bit imm; let Inst{31-23} = 0b110000011; - let Inst{22} = op{3}; // sz + let Inst{22} = op{4}; // sz let Inst{21} = 0b1; let Inst{20-18} = Zm; let Inst{17-15} = 0b010; let Inst{14-13} = Rv; let Inst{12-10} = 0b000; let Inst{9-7} = Zn; - let Inst{6-5} = 0b00; - let Inst{4-2} = op{2-0}; + let Inst{6} = 0b0; + let Inst{5-2} = op{3-0}; let Inst{1} = 0b0; let Inst{0} = imm; let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_vg4_multi op, +multiclass sme2_mla_ll_array_vg4_multi op, MatrixOperand matrix_ty, RegisterOperand vector_ty, ValueType vt, SDPatternOperator intrinsic> { @@ -2985,7 +3018,7 @@ class sme2_spill_fill_vector opc> // SME2 move to/from lookup table class sme2_movt_zt_to_scalar opc> : I<(outs GPR64:$Rt), (ins ZTR:$ZTt, uimm3s8:$imm3), - mnemonic, "\t$Rt, $ZTt$imm3", + mnemonic, "\t$Rt, $ZTt[$imm3]", "", []>, Sched<[]> { bits<3> imm3; bits<5> Rt; @@ -2997,7 +3030,7 @@ class sme2_movt_zt_to_scalar opc> class sme2_movt_scalar_to_zt opc> : I<(outs ZTR:$ZTt), (ins uimm3s8:$imm3, GPR64:$Rt), - mnemonic, "\t$ZTt$imm3, $Rt", + mnemonic, "\t$ZTt[$imm3], $Rt", "", []>, Sched<[]> { bits<3> imm3; bits<5> Rt; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 4902ec3639ec..a35a3e4f40c5 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -8721,8 +8721,8 @@ multiclass sve2_crypto_unary_op { // SVE BFloat16 Group //===----------------------------------------------------------------------===// -class sve_float_dot -: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), +class sve_float_dot +: I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, src_ty:$Zn, src_ty:$Zm), asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; @@ -8731,7 +8731,8 @@ class sve_float_dot let Inst{22} = bf; let Inst{21} = 0b1; let Inst{20-16} = Zm; - let Inst{15-10} = 0b100000; + let Inst{15-11} = 0b10000; + let Inst{10} = o2; let Inst{9-5} = Zn; let Inst{4-0} = Zda; @@ -8741,24 +8742,24 @@ class sve_float_dot let mayRaiseFPException = 1; } -multiclass sve_float_dot { - def NAME : sve_float_dot; +multiclass sve_float_dot { + def NAME : sve_float_dot; def : 
SVE_3_Op_Pat(NAME)>; } -class sve_float_dot_indexed -: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS32b:$iop), +class sve_float_dot_indexed +: I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, src1_ty:$Zn, src2_ty:$Zm, iop_ty:$iop), asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; bits<3> Zm; - bits<2> iop; let Inst{31-23} = 0b011001000; let Inst{22} = bf; let Inst{21} = 0b1; - let Inst{20-19} = iop; let Inst{18-16} = Zm; - let Inst{15-10} = 0b010000; + let Inst{15-12} = 0b0100; let Inst{9-5} = Zn; let Inst{4-0} = Zda; @@ -8768,8 +8769,14 @@ class sve_float_dot_indexed let mayRaiseFPException = 1; } -multiclass sve_float_dot_indexed { - def NAME : sve_float_dot_indexed; +multiclass sve_float_dot_indexed opc, ZPRRegOp src1_ty, + ZPRRegOp src2_ty, string asm, ValueType InVT, + SDPatternOperator op> { + def NAME : sve_float_dot_indexed { + bits<2> iop; + let Inst{20-19} = iop; + let Inst{11-10} = opc; + } def : SVE_4_Op_Imm_Pat(NAME)>; } -- Gitee From d7f1a7150245237ac2eba9b2e0c07bde6536687a Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Sat, 14 Oct 2023 16:35:03 +0100 Subject: [PATCH 33/77] [AArch64][compiler-rt] Only build SME ABI routines when compiler supports asm. (#68991) This also adds the .variant_pcs directive to some functions from which it was previously missing. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- compiler-rt/lib/builtins/CMakeLists.txt | 9 ++-- compiler-rt/lib/builtins/aarch64/sme-abi.S | 59 +++++++--------------- 2 files changed, 25 insertions(+), 43 deletions(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 156640958a41..13648185d963 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -553,10 +553,14 @@ set(aarch64_SOURCES ${GENERIC_SOURCES} cpu_model.c aarch64/fp_mode.c - aarch64/sme-abi.S - aarch64/sme-abi-init.c ) +if(COMPILER_RT_HAS_ASM_SME) + list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c) +else() + message(STATUS "AArch64 SME ABI routines disabled") +endif() + # Generate outline atomics helpers from lse.S base set(OA_HELPERS_DIR "${CMAKE_CURRENT_BINARY_DIR}/outline_atomic_helpers.dir") file(MAKE_DIRECTORY "${OA_HELPERS_DIR}") @@ -780,7 +784,6 @@ else () endif() append_list_if(COMPILER_RT_HAS_ASM_LSE HAS_ASM_LSE BUILTIN_DEFS) - append_list_if(COMPILER_RT_HAS_ASM_SME HAS_ASM_SME BUILTIN_DEFS) foreach (arch ${BUILTIN_SUPPORTED_ARCH}) if (CAN_TARGET_${arch}) diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S index 207810b2e252..b3612c68066f 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-abi.S +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -8,29 +8,6 @@ #include "../assembly.h" -#ifdef HAS_ASM_SME -#define ARCH armv9-a+sme -#define SMSTOP_SM smstop sm -#define SMSTOP_ZA smstop za -#define REG_TPIDR2_EL0 TPIDR2_EL0 -#define REG_SVCR SVCR -#define ADDSVL_X16_X16_1 addsvl x16, x16, #1 -#define LDR_ZA_W15_0_X16 ldr za[w15,0], [x16] -#define STR_ZA_W15_0_X16 str za[w15,0], [x16] -#define CNTD_X0 cntd x0 -#define CFI_OFFSET_VG_MINUS_16 .cfi_offset vg, -16 -#else -#define ARCH armv8-a -#define SMSTOP_SM .inst 0xd503427f -#define SMSTOP_ZA .inst 0xd503447f -#define REG_TPIDR2_EL0 S3_3_C13_C0_5 -#define REG_SVCR S3_3_C4_C2_2 -#define ADDSVL_X16_X16_1 .inst 0x04305830 -#define LDR_ZA_W15_0_X16 .inst 0xe1006200 -#define STR_ZA_W15_0_X16 .inst 0xe1206200 -#define CNTD_X0 .inst 0x04e0e3e0 
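// (Note on the block below: these .inst fallback macros hand-encoded each SME
// instruction so that this file could still be assembled by toolchains without
// SME support; since the build now only includes sme-abi.S when
// COMPILER_RT_HAS_ASM_SME is set, they are removed and the real mnemonics are
// used directly.)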
-#define CFI_OFFSET_VG_MINUS_16 .cfi_escape 0x10, 0x2e, 0x03, 0x11, 0x70, 0x22 // $vg @ cfa - 16 -#endif #if !defined(__APPLE__) #define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) @@ -42,7 +19,7 @@ #define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff #endif -.arch ARCH +.arch armv9-a+sme // Utility function which calls a system's abort() routine. Because the function // is streaming-compatible it should disable streaming-SVE mode before calling @@ -50,19 +27,19 @@ // because the function does not return. DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) .cfi_startproc - .variant_pcs SYMBOL_NAME(do_abort) + .variant_pcs SYMBOL_NAME(do_abort) stp x29, x30, [sp, #-32]! - CNTD_X0 + cntd x0 // Store VG to a stack location that we describe with .cfi_offset str x0, [sp, #16] .cfi_def_cfa_offset 32 .cfi_offset w30, -24 .cfi_offset w29, -32 - CFI_OFFSET_VG_MINUS_16 + .cfi_offset vg, -16 bl __arm_sme_state tbz x0, #0, 2f 1: - SMSTOP_SM + smstop sm 2: // We can't make this into a tail-call because the unwinder would // need to restore the value of VG. @@ -74,7 +51,7 @@ END_COMPILERRT_FUNCTION(do_abort) // that is set as part of the compiler-rt startup code. // __aarch64_has_sme_and_tpidr2_el0 DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) - .variant_pcs __arm_sme_state + .variant_pcs __arm_sme_state mov x0, xzr mov x1, xzr @@ -83,18 +60,18 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) cbz w16, 1f 0: orr x0, x0, #0xC000000000000000 - mrs x16, REG_SVCR + mrs x16, SVCR bfxil x0, x16, #0, #2 - mrs x1, REG_TPIDR2_EL0 + mrs x1, TPIDR2_EL0 1: ret END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) - .variant_pcs __arm_tpidr2_restore + .variant_pcs __arm_tpidr2_restore // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific // manner. - mrs x14, REG_TPIDR2_EL0 + mrs x14, TPIDR2_EL0 cbnz x14, 2f // If any of the reserved bytes in the first 16 bytes of BLK are nonzero, @@ -114,8 +91,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) mov x15, xzr 0: - LDR_ZA_W15_0_X16 - ADDSVL_X16_X16_1 + ldr za[w15,0], [x16] + addsvl x16, x16, #1 add x15, x15, #1 cmp x14, x15 b.ne 0b @@ -126,6 +103,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) + .variant_pcs __arm_tpidr2_restore // If the current thread does not have access to TPIDR2_EL0, the subroutine // does nothing. adrp x14, TPIDR2_SYMBOL @@ -133,7 +111,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) cbz w14, 1f // If TPIDR2_EL0 is null, the subroutine does nothing. - mrs x16, REG_TPIDR2_EL0 + mrs x16, TPIDR2_EL0 cbz x16, 1f // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are @@ -153,8 +131,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) mov x15, xzr 0: - STR_ZA_W15_0_X16 - ADDSVL_X16_X16_1 + str za[w15,0], [x16] + addsvl x16, x16, #1 add x15, x15, #1 cmp x14, x15 b.ne 0b @@ -165,6 +143,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) + .variant_pcs __arm_tpidr2_restore // If the current thread does not have access to SME, the subroutine does // nothing. 
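// (__aarch64_has_sme_and_tpidr2_el0 is a flag byte initialized by the
// compiler-rt startup code in sme-abi-init.c; testing it first avoids reading
// TPIDR2_EL0, which is an undefined register access on cores without SME.)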
adrp x14, TPIDR2_SYMBOL @@ -182,10 +161,10 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) bl __arm_tpidr2_save // * Set TPIDR2_EL0 to null. - msr REG_TPIDR2_EL0, xzr + msr TPIDR2_EL0, xzr // * Set PSTATE.ZA to 0. - SMSTOP_ZA + smstop za .cfi_def_cfa wsp, 16 ldp x29, x30, [sp], #16 -- Gitee From 6ed1b7069a4ab90f716a63f68d918366929dc01f Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 17 Oct 2023 16:02:36 +0100 Subject: [PATCH 34/77] [AArch64][SME] Remove immediate argument restriction for svldr and svstr (#68908) The svldr_vnum_za and svstr_vnum_za builtins/intrinsics currently require that the vnum argument be an immediate, but since vnum is used to modify the base register via a mul and add, that restriction is not necessary. This patch removes that restriction. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/arm_sme.td | 10 ++++------ clang/lib/CodeGen/CGBuiltin.cpp | 15 +++++---------- clang/lib/CodeGen/CodeGenFunction.h | 1 - .../aarch64-sme-intrinsics/acle_sme_ldr.c | 16 ++++++++++++++++ .../aarch64-sme-intrinsics/acle_sme_str.c | 15 +++++++++++++++ .../Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp | 8 -------- 6 files changed, 40 insertions(+), 25 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index be9b09980165..538c717eb253 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -44,10 +44,9 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0 defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; -def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQi", "", +def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], - MemEltTyDefault, "aarch64_sme_ldr", - [ImmCheck<2, ImmCheck0_15>]>; + MemEltTyDefault, "aarch64_sme_ldr">; def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], @@ -82,10 +81,9 @@ defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; -def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%i", "", +def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], - MemEltTyDefault, "aarch64_sme_str", - [ImmCheck<2, ImmCheck0_15>]>; + MemEltTyDefault, "aarch64_sme_str">; def SVSTR_ZA : MInst<"svstr_za", "vm%", "", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 57ac75cba54f..094745ac0c06 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9456,11 +9456,6 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, return Store; } -Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) { - llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int32Ty, false); - return Builder.CreateAdd(Base, CastOffset, "tileslice"); -} - Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { @@ -9519,13 +9514,13 @@ Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags, if (Ops.size() == 3) { 
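    // (Three operands here means the vnum form of the builtin, while the plain
    // two-operand svldr_za/svstr_za form needs no address or slice adjustment.
    // Illustrative call shapes, not taken from the tests in this patch:
    //   svldr_za(slice, ptr);            // Ops.size() == 2
    //   svldr_vnum_za(slice, ptr, 15);   // Ops.size() == 3, vnum scales by SVL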
Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb"); - llvm::Value *MulVL = Builder.CreateMul( - CntsbCall, - Builder.getInt64(cast(Ops[2])->getZExtValue()), - "mulvl"); + + llvm::Value *VecNum = Ops[2]; + llvm::Value *MulVL = Builder.CreateMul(CntsbCall, VecNum, "mulvl"); Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL); - Ops[0] = EmitTileslice(Ops[0], Ops[2]); + Ops[0] = Builder.CreateAdd( + Ops[0], Builder.CreateIntCast(VecNum, Int32Ty, true), "tileslice"); Ops.erase(&Ops[2]); } Function *F = CGM.getIntrinsic(IntID, {}); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index e44504ac0213..c4795da464d4 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4272,7 +4272,6 @@ public: llvm::Value *EmitSVEMaskedStore(const CallExpr *, SmallVectorImpl &Ops, unsigned BuiltinID); - llvm::Value *EmitTileslice(llvm::Value *Offset, llvm::Value *Base); llvm::Value *EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned BuiltinID); diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index acddc2ef50a3..3f8bb6a8cdfe 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -34,6 +34,22 @@ void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]]) // CHECK-NEXT: ret void +// void test_svldr_za(uint32_t slice_base, const void *ptr) { svldr_za(slice_base, ptr); } + +// CHECK-C-LABEL: @test_svldr_vnum_za_var( +// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvl( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM:%.*]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]] +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) +// CHECK-NEXT: ret void +// +void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) { + svldr_vnum_za(slice_base, ptr, vnum); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index 2728f9ac0cd1..94c95b6664a0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -38,3 +38,18 @@ void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { void test_svstr_za(uint32_t slice_base, void *ptr) { svstr_za(slice_base, ptr); } + +// CHECK-C-LABEL: @test_svstr_vnum_za_var( +// CHECK-CXX-LABEL: @_Z22test_svstr_vnum_za_varjPvl( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM:%.*]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]] +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) +// CHECK-NEXT: ret void +// +void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t 
vnum) { + svstr_vnum_za(slice_base, ptr, vnum); } diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index 9b88d463d5e2..68395a39b878 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -138,11 +138,6 @@ void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(16, slice, pg, ptr, 1); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svldr_vnum_za,,,)(-1, ptr, 16); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svstr_vnum_za,,,)(-1, ptr, -1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, slice); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} @@ -166,9 +161,6 @@ void test_constant(uint64_t u64, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}} - SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}} - SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}} - SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}} SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(u64, 0, pg, svundef_s64()); // expected-error-re {{argument to 'svwrite_ver_za64{{.*}}_m' must be a constant integer}} } -- Gitee From 5f615a21198a64cdb1dee14dc407c89b3a09406b Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 18 Oct 2023 10:39:43 +0100 Subject: [PATCH 35/77] [compiler-rt] Only build SME ABI routines for baremetal or platforms that have sys/auxv.h (#69423) This avoids link failures on other platforms that don't (yet) have an implementation of __aarch64_sme_accessible. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- compiler-rt/lib/builtins/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 13648185d963..d08bdd1959b8 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -555,8 +555,9 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) -if(COMPILER_RT_HAS_ASM_SME) +if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c) + message(STATUS "AArch64 SME ABI routines enabled") else() message(STATUS "AArch64 SME ABI routines disabled") endif() -- Gitee From 48e542672311434127f48a2acf1229468d0560dc Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 30 Oct 2023 10:47:07 +0000 Subject: [PATCH 36/77] [AArch64][SME] Allow inlining when streaming-mode attributes don't match up.
(#68415) The use-case here is to support things like: int foo(int x, int y) __arm_streaming { return std::max(x, y); } where the call to non-streaming `std::max(x, y)` can be safely inlined into the streaming function. This is a first step and will need further work to allow more cases (e.g. more fine-grained analysis of the function calls to ensure they don't result in any incompatible instructions for the requested mode). Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../AArch64/AArch64TargetTransformInfo.cpp | 41 +++++- .../Inline/AArch64/sme-pstatesm-attrs.ll | 138 ++++++++++++++---- .../Inline/AArch64/sme-pstateza-attrs.ll | 79 +++++++++- 3 files changed, 218 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 06bb85768725..17d42889d63c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -190,16 +190,49 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( static cl::opt<bool> EnableScalableAutovecInStreamingMode( "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); +static bool isSMEABIRoutineCall(const CallInst &CI) { + const auto *F = CI.getCalledFunction(); + return F && StringSwitch<bool>(F->getName()) + .Case("__arm_sme_state", true) + .Case("__arm_tpidr2_save", true) + .Case("__arm_tpidr2_restore", true) + .Case("__arm_za_disable", true) + .Default(false); +} + +/// Returns true if the function has explicit operations that can only be +/// lowered using incompatible instructions for the selected mode. This also +/// returns true if the function F may use or modify ZA state. +static bool hasPossibleIncompatibleOps(const Function *F) { + for (const BasicBlock &BB : *F) { + for (const Instruction &I : BB) { + // Be conservative for now and assume that any call to inline asm or to + // intrinsics could result in non-streaming ops (e.g. calls to + // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that + // all native LLVM instructions can be lowered to compatible instructions.
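+      // (Debug and pseudo instructions are excluded below because debug
+      // intrinsics such as @llvm.dbg.value are themselves calls; without that
+      // check, building with -g could change inlining decisions.)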
+ if (isa(I) && !I.isDebugOrPseudoInst() && + (cast(I).isInlineAsm() || isa(I) || + isSMEABIRoutineCall(cast(I)))) + return true; + } + } + return false; +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller); SMEAttrs CalleeAttrs(*Callee); - if (CallerAttrs.requiresSMChange(CalleeAttrs, - /*BodyOverridesInterface=*/true) || - CallerAttrs.requiresLazySave(CalleeAttrs) || - CalleeAttrs.hasNewZABody()) + if (CalleeAttrs.hasNewZABody()) return false; + if (CallerAttrs.requiresLazySave(CalleeAttrs) || + CallerAttrs.requiresSMChange(CalleeAttrs, + /*BodyOverridesInterface=*/true)) { + if (hasPossibleIncompatibleOps(Callee)) + return false; + } + const TargetMachine &TM = getTLI()->getTargetMachine(); const FeatureBitset &CallerBits = diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll index 3df5400875ae..f2f5768dbe9c 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll @@ -102,11 +102,11 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_streaming_callee_dont_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_callee_dont_inline +define void @normal_caller_streaming_callee_inline() { +; CHECK-LABEL: define void @normal_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -136,11 +136,11 @@ entry: ; [ ] N -> SC ; [x] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_locally_streaming_callee_dont_inline() { -; CHECK-LABEL: define void @normal_caller_locally_streaming_callee_dont_inline +define void @normal_caller_locally_streaming_callee_inline() { +; CHECK-LABEL: define void @normal_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @locally_streaming_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -153,11 +153,11 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [x] N -> SC + B -define void @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline +define void @normal_caller_streaming_compatible_locally_streaming_callee_inline() { +; CHECK-LABEL: define void @normal_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -170,11 +170,11 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_normal_callee_dont_inline +define void @streaming_caller_normal_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define void @streaming_caller_normal_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -255,11 +255,11 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" { -; 
CHECK-LABEL: define void @locally_streaming_caller_normal_callee_dont_inline +define void @locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define void @locally_streaming_caller_normal_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -340,11 +340,11 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_normal_callee_dont_inline +define void @streaming_compatible_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define void @streaming_compatible_caller_normal_callee_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -357,11 +357,11 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_callee_dont_inline +define void @streaming_compatible_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define void @streaming_compatible_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -391,11 +391,11 @@ entry: ; [ ] SC -> SC ; [x] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_locally_streaming_callee_dont_inline +define void @streaming_compatible_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define void @streaming_compatible_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @locally_streaming_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -408,11 +408,11 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [x] SC -> SC + B -define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline +define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -424,11 +424,11 @@ entry: ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline +define void @streaming_compatible_locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { 
+; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_normal_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -503,3 +503,81 @@ entry: call void @streaming_compatible_locally_streaming_callee() ret void } + +define void @normal_callee_with_inlineasm() { +; CHECK-LABEL: define void @normal_callee_with_inlineasm +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; +entry: + call void asm sideeffect "; inlineasm", ""() + ret void +} + +define void @streaming_caller_normal_callee_with_inlineasm_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define void @streaming_caller_normal_callee_with_inlineasm_dont_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee_with_inlineasm() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee_with_inlineasm() + ret void +} + +define i64 @normal_callee_with_intrinsic_call() { +; CHECK-LABEL: define i64 @normal_callee_with_intrinsic_call +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.aarch64.sve.cntb(i32 4) +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %res = call i64 @llvm.aarch64.sve.cntb(i32 4) + ret i64 %res +} + +define i64 @streaming_caller_normal_callee_with_intrinsic_call_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i64 @streaming_caller_normal_callee_with_intrinsic_call_dont_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RES:%.*]] = call i64 @normal_callee_with_intrinsic_call() +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %res = call i64 @normal_callee_with_intrinsic_call() + ret i64 %res +} + +declare i64 @llvm.aarch64.sve.cntb(i32) + +define i64 @normal_callee_call_sme_state() { +; CHECK-LABEL: define i64 @normal_callee_call_sme_state +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RES:%.*]] = call { i64, i64 } @__arm_sme_state() +; CHECK-NEXT: [[RES_0:%.*]] = extractvalue { i64, i64 } [[RES]], 0 +; CHECK-NEXT: ret i64 [[RES_0]] +; +entry: + %res = call {i64, i64} @__arm_sme_state() + %res.0 = extractvalue {i64, i64} %res, 0 + ret i64 %res.0 +} + +declare {i64, i64} @__arm_sme_state() + +define i64 @streaming_caller_normal_callee_call_sme_state_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i64 @streaming_caller_normal_callee_call_sme_state_dont_inline +; CHECK-SAME: () #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RES:%.*]] = call i64 @normal_callee_call_sme_state() +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %res = call i64 @normal_callee_call_sme_state() + ret i64 %res +} diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll index a833e7a911ac..7fca45b1e43f 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll @@ -3,10 +3,12 @@ declare void @inlined_body() +; ; Define some functions that will be called by the functions below. ; These just call a '...body()' function. If we see the call to one of ; these functions being replaced by '...body()', then we know it has been ; inlined. 
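; (Summary of the cases below: a callee marked "aarch64_pstate_za_new" is never
; inlined, because setting up its new ZA state cannot be replayed in the
; caller; a callee that provably does not touch ZA may now be inlined even
; when the caller has ZA state.)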
+; define void @nonza_callee() { ; CHECK-LABEL: define void @nonza_callee @@ -42,6 +44,7 @@ define void @new_za_callee() "aarch64_pstate_za_new" { ret void } +; ; Now test that inlining only happens when no lazy-save is needed. ; Test for a number of combinations, where: ; N Not using ZA. @@ -81,11 +84,11 @@ entry: ; [x] Z -> N ; [ ] Z -> S ; [ ] Z -> Z -define void @new_za_caller_nonza_callee_dont_inline() "aarch64_pstate_za_new" { -; CHECK-LABEL: define void @new_za_caller_nonza_callee_dont_inline +define void @new_za_caller_nonza_callee_inline() "aarch64_pstate_za_new" { +; CHECK-LABEL: define void @new_za_caller_nonza_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @nonza_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -126,11 +129,11 @@ entry: ; [x] Z -> N ; [ ] Z -> S ; [ ] Z -> Z -define void @shared_za_caller_nonza_callee_dont_inline() "aarch64_pstate_za_shared" { -; CHECK-LABEL: define void @shared_za_caller_nonza_callee_dont_inline +define void @shared_za_caller_nonza_callee_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define void @shared_za_caller_nonza_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @nonza_callee() +; CHECK-NEXT: call void @inlined_body() ; CHECK-NEXT: ret void ; entry: @@ -167,3 +170,67 @@ entry: call void @shared_za_callee() ret void } + +define void @private_za_callee_call_za_disable() { +; CHECK-LABEL: define void @private_za_callee_call_za_disable +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: call void @__arm_za_disable() +; CHECK-NEXT: ret void +; + call void @__arm_za_disable() + ret void +} + +define void @shared_za_caller_private_za_callee_call_za_disable() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_za_disable +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: call void @private_za_callee_call_za_disable() +; CHECK-NEXT: ret void +; + call void @private_za_callee_call_za_disable() + ret void +} + +define void @private_za_callee_call_tpidr2_save() { +; CHECK-LABEL: define void @private_za_callee_call_tpidr2_save +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: call void @__arm_tpidr2_save() +; CHECK-NEXT: ret void +; + call void @__arm_tpidr2_save() + ret void +} + +define void @shared_za_caller_private_za_callee_call_tpidr2_save_dont_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_tpidr2_save_dont_inline +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: call void @private_za_callee_call_tpidr2_save() +; CHECK-NEXT: ret void +; + call void @private_za_callee_call_tpidr2_save() + ret void +} + +define void @private_za_callee_call_tpidr2_restore(ptr %ptr) { +; CHECK-LABEL: define void @private_za_callee_call_tpidr2_restore +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @__arm_tpidr2_restore(ptr [[PTR]]) +; CHECK-NEXT: ret void +; + call void @__arm_tpidr2_restore(ptr %ptr) + ret void +} + +define void @shared_za_caller_private_za_callee_call_tpidr2_restore_dont_inline(ptr %ptr) "aarch64_pstate_za_shared" { +; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_tpidr2_restore_dont_inline +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: call void @private_za_callee_call_tpidr2_restore(ptr [[PTR]]) +; CHECK-NEXT: ret void +; + call void @private_za_callee_call_tpidr2_restore(ptr %ptr) + ret void +} + +declare void @__arm_za_disable() +declare void @__arm_tpidr2_save() +declare void 
@__arm_tpidr2_restore(ptr) -- Gitee From 7a86121ad53034f1b0404ec2433450ddae40100d Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 27 Oct 2023 16:30:55 +0000 Subject: [PATCH 37/77] [compiler-rt] Don't use 'vg' in CFI directives for SME ABI routines This broke some builds where GNU assembler doesn't support 'vg'. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- compiler-rt/lib/builtins/aarch64/sme-abi.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S index b3612c68066f..d470ecaf7aaa 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-abi.S +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -35,7 +35,7 @@ DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) .cfi_def_cfa_offset 32 .cfi_offset w30, -24 .cfi_offset w29, -32 - .cfi_offset vg, -16 + .cfi_offset 46, -16 bl __arm_sme_state tbz x0, #0, 2f 1: -- Gitee From 38ee64e4fe316884531d097305142f70da175122 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 2 Nov 2023 15:47:37 +0000 Subject: [PATCH 38/77] [AArch64][Clang] Refactor code to emit SVE & SME builtins (#70959) This patch removes duplicated code in EmitAArch64SVEBuiltinExpr and EmitAArch64SMEBuiltinExpr by creating a new function called GetAArch64SVEProcessedOperands which handles splitting up multi-vector arguments using vector extracts. These changes are non-functional. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/lib/CodeGen/CGBuiltin.cpp | 151 ++--- clang/lib/CodeGen/CodeGenFunction.h | 5 + .../acle_sve_st2-bfloat.c | 36 +- .../aarch64-sve-intrinsics/acle_sve_st2.c | 356 ++++++------ .../acle_sve_st3-bfloat.c | 44 +- .../aarch64-sve-intrinsics/acle_sve_st3.c | 436 +++++++-------- .../acle_sve_st4-bfloat.c | 52 +- .../aarch64-sve-intrinsics/acle_sve_st4.c | 516 +++++++++--------- 8 files changed, 804 insertions(+), 792 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 094745ac0c06..d15194ae50e1 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9327,23 +9327,19 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags, Value *BasePtr = Builder.CreateBitCast(Ops[1], VecPtrTy); // Does the store have an offset? - if (Ops.size() > 3) + if (Ops.size() > (2 + N)) BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]); BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy); - Value *Val = Ops.back(); - + // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we // need to break up the tuple vector. 
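+  // (After this refactor the tuple argument arrives already split into its
+  // part vectors by GetAArch64SVEProcessedOperands, so the last N entries of
+  // Ops are forwarded as-is and no vector.extract calls are emitted here.)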
SmallVector Operands; - unsigned MinElts = VTy->getElementCount().getKnownMinValue(); - for (unsigned I = 0; I < N; ++I) { - Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts); - Operands.push_back(Builder.CreateExtractVector(VTy, Val, Idx)); - } + for (unsigned I = Ops.size() - N; I < Ops.size(); ++I) + Operands.push_back(Ops[I]); Operands.append({Predicate, BasePtr}); - Function *F = CGM.getIntrinsic(IntID, { VTy }); + return Builder.CreateCall(F, Operands); } @@ -9615,26 +9611,24 @@ Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags, return Call; } -Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, - const CallExpr *E) { +void CodeGenFunction::GetAArch64SVEProcessedOperands( + unsigned BuiltinID, const CallExpr *E, SmallVectorImpl &Ops, + SVETypeFlags TypeFlags) { // Find out if any arguments are required to be integer constant expressions. unsigned ICEArguments = 0; ASTContext::GetBuiltinTypeError Error; getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); assert(Error == ASTContext::GE_None && "Should not codegen an error"); - llvm::Type *Ty = ConvertType(E->getType()); - if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 && - BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) { - Value *Val = EmitScalarExpr(E->getArg(0)); - return EmitSVEReinterpret(Val, Ty); - } + // Tuple set/get only requires one insert/extract vector, which is + // created by EmitSVETupleSetOrGet. + bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet(); - llvm::SmallVector Ops; for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { - if ((ICEArguments & (1 << i)) == 0) - Ops.push_back(EmitScalarExpr(E->getArg(i))); - else { + bool IsICE = ICEArguments & (1 << i); + Value *Arg = EmitScalarExpr(E->getArg(i)); + + if (IsICE) { // If this is required to be a constant, constant fold it so that we know // that the generated intrinsic gets a ConstantInt. std::optional Result = @@ -9646,12 +9640,49 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, // immediate requires more than a handful of bits. *Result = Result->extOrTrunc(32); Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result)); + continue; + } + + if (IsTupleGetOrSet || !isa(Arg->getType())) { + Ops.push_back(Arg); + continue; + } + + auto *VTy = cast(Arg->getType()); + unsigned MinElts = VTy->getMinNumElements(); + bool IsPred = VTy->getElementType()->isIntegerTy(1); + unsigned N = (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 
16 : 128); + + if (N == 1) { + Ops.push_back(Arg); + continue; + } + + for (unsigned I = 0; I < N; ++I) { + Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N); + auto *NewVTy = + ScalableVectorType::get(VTy->getElementType(), MinElts / N); + Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx)); } } +} + +Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + llvm::Type *Ty = ConvertType(E->getType()); + if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 && + BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) { + Value *Val = EmitScalarExpr(E->getArg(0)); + return EmitSVEReinterpret(Val, Ty); + } auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID, AArch64SVEIntrinsicsProvenSorted); + + llvm::SmallVector Ops; SVETypeFlags TypeFlags(Builtin->TypeModifier); + GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags); + if (TypeFlags.isLoad()) return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic, TypeFlags.isZExtReturn()); @@ -9665,14 +9696,14 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isGatherPrefetch()) return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (TypeFlags.isStructLoad()) - return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (TypeFlags.isStructStore()) - return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); + else if (TypeFlags.isStructLoad()) + return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); + else if (TypeFlags.isStructStore()) + return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) - return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops); + return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops); else if (TypeFlags.isTupleCreate()) - return EmitSVETupleCreate(TypeFlags, Ty, Ops); + return EmitSVETupleCreate(TypeFlags, Ty, Ops); else if (TypeFlags.isUndef()) return UndefValue::get(Ty); else if (Builtin->LLVMIntrinsic != 0) { @@ -9891,13 +9922,8 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, case SVE::BI__builtin_sve_svtbl2_f64: { SVETypeFlags TF(Builtin->TypeModifier); auto VTy = cast(getSVEType(TF)); - Value *V0 = Builder.CreateExtractVector(VTy, Ops[0], - ConstantInt::get(CGM.Int64Ty, 0)); - unsigned MinElts = VTy->getMinNumElements(); - Value *V1 = Builder.CreateExtractVector( - VTy, Ops[0], ConstantInt::get(CGM.Int64Ty, MinElts)); Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy); - return Builder.CreateCall(F, {V0, V1, Ops[1]}); + return Builder.CreateCall(F, Ops); } case SVE::BI__builtin_sve_svset_neonq_s8: @@ -9955,35 +9981,13 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { - // Find out if any arguments are required to be integer constant expressions. 
- unsigned ICEArguments = 0; - ASTContext::GetBuiltinTypeError Error; - getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); - assert(Error == ASTContext::GE_None && "Should not codegen an error"); - - llvm::Type *Ty = ConvertType(E->getType()); - llvm::SmallVector Ops; - for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { - if ((ICEArguments & (1 << i)) == 0) - Ops.push_back(EmitScalarExpr(E->getArg(i))); - else { - // If this is required to be a constant, constant fold it so that we know - // that the generated intrinsic gets a ConstantInt. - std::optional Result = - E->getArg(i)->getIntegerConstantExpr(getContext()); - assert(Result && "Expected argument to be a constant"); - - // Immediates for SVE llvm intrinsics are always 32bit. We can safely - // truncate because the immediate has been range checked and no valid - // immediate requires more than a handful of bits. - *Result = Result->extOrTrunc(32); - Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result)); - } - } - auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID, AArch64SMEIntrinsicsProvenSorted); + + llvm::SmallVector Ops; SVETypeFlags TypeFlags(Builtin->TypeModifier); + GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags); + if (TypeFlags.isLoad() || TypeFlags.isStore()) return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA()) @@ -9996,21 +10000,24 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, BuiltinID == SME::BI__builtin_sme_svldr_za || BuiltinID == SME::BI__builtin_sme_svstr_za) return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (Builtin->LLVMIntrinsic != 0) { - // Predicates must match the main datatype. - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (auto PredTy = dyn_cast(Ops[i]->getType())) - if (PredTy->getElementType()->isIntegerTy(1)) - Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags)); - Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, - getSVEOverloadTypes(TypeFlags, Ty, Ops)); - Value *Call = Builder.CreateCall(F, Ops); - return Call; - } + // Should not happen! + if (Builtin->LLVMIntrinsic == 0) + return nullptr; - /// Should not happen - return nullptr; + // Predicates must match the main datatype. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (auto PredTy = dyn_cast(Ops[i]->getType())) + if (PredTy->getElementType()->isIntegerTy(1)) + Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags)); + + Function *F = + TypeFlags.isOverloadNone() + ? 
CGM.getIntrinsic(Builtin->LLVMIntrinsic) + : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)}); + Value *Call = Builder.CreateCall(F, Ops); + + return FormSVEBuiltinResult(Call); } Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index c4795da464d4..0553b4651fe9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4298,6 +4298,11 @@ public: llvm::Value *EmitSMELdrStr(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl &Ops, unsigned IntID); + + void GetAArch64SVEProcessedOperands(unsigned BuiltinID, const CallExpr *E, + SmallVectorImpl &Ops, + SVETypeFlags TypeFlags); + llvm::Value *EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E, diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c index fcfa7eb57683..a9a285bdbfb8 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c @@ -16,18 +16,18 @@ #endif // CHECK-LABEL: define {{[^@]+}}@test_svst2_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) @@ -37,20 +37,20 @@ void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = 
getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c index 3235a8b9e111..dece2ca82c53 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c @@ -35,18 +35,18 @@ void test_svst2_s8(svbool_t pg, int8_t *base, svint8x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s16u10__SVBool_tPs11svint16x2_t( // CPP-CHECK-NEXT: entry: -// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s16(svbool_t pg, int16_t *base, svint16x2_t data) @@ -56,18 +56,18 @@ void test_svst2_s16(svbool_t pg, int16_t *base, svint16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s32u10__SVBool_tPi11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s32(svbool_t pg, int32_t *base, svint32x2_t data) @@ -77,18 +77,18 @@ void test_svst2_s32(svbool_t pg, int32_t *base, svint32x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s64u10__SVBool_tPl11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s64(svbool_t pg, int64_t *base, svint64x2_t data) @@ -117,18 +117,18 @@ void test_svst2_u8(svbool_t pg, uint8_t *base, svuint8x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u16u10__SVBool_tPt12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u16(svbool_t pg, uint16_t *base, svuint16x2_t data) @@ -138,18 +138,18 
@@ void test_svst2_u16(svbool_t pg, uint16_t *base, svuint16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u32u10__SVBool_tPj12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u32(svbool_t pg, uint32_t *base, svuint32x2_t data) @@ -159,18 +159,18 @@ void test_svst2_u32(svbool_t pg, uint32_t *base, svuint32x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u64u10__SVBool_tPm12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr 
[[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u64(svbool_t pg, uint64_t *base, svuint64x2_t data) @@ -180,18 +180,18 @@ void test_svst2_u64(svbool_t pg, uint64_t *base, svuint64x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f16u10__SVBool_tPDh13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f16(svbool_t pg, float16_t *base, svfloat16x2_t data) @@ -201,18 +201,18 @@ void test_svst2_f16(svbool_t pg, float16_t *base, svfloat16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], 
ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f32u10__SVBool_tPf13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f32(svbool_t pg, float32_t *base, svfloat32x2_t data) @@ -222,18 +222,18 @@ void test_svst2_f32(svbool_t pg, float32_t *base, svfloat32x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f64u10__SVBool_tPd13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f64(svbool_t pg, float64_t *base, svfloat64x2_t data) @@ -243,18 +243,18 @@ void test_svst2_f64(svbool_t pg, float64_t *base, svfloat64x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_s8u10__SVBool_tPal10svint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x2_t data) @@ -264,20 +264,20 @@ void test_svst2_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x2_t data // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s16u10__SVBool_tPsl11svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x2_t data) @@ -287,20 +287,20 @@ void test_svst2_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x2_t d // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s32u10__SVBool_tPil11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x2_t data) @@ -310,20 +310,20 @@ void test_svst2_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x2_t d // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s64u10__SVBool_tPll11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x2_t data) @@ -333,18 +333,18 @@ void test_svst2_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x2_t d // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_u8u10__SVBool_tPhl11svuint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: 
tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x2_t data) @@ -354,20 +354,20 @@ void test_svst2_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x2_t da // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u16u10__SVBool_tPtl12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x2_t data) @@ -377,20 +377,20 @@ void test_svst2_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x2_t // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u32u10__SVBool_tPjl12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x2_t data) @@ -400,20 +400,20 @@ void test_svst2_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x2_t // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u64u10__SVBool_tPml12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail 
call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x2_t data) @@ -423,20 +423,20 @@ void test_svst2_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x2_t // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f16u10__SVBool_tPDhl13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x2_t data) @@ -446,20 +446,20 @@ void test_svst2_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x2 // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( 
[[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f32u10__SVBool_tPfl13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x2_t data) @@ -469,20 +469,20 @@ void test_svst2_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x2 // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f64u10__SVBool_tPdl13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( 
[[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c index 096699191e88..ffd24a45b94e 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c @@ -17,20 +17,20 @@ // CHECK-LABEL: define {{[^@]+}}@test_svst3_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) @@ -40,22 +40,22 @@ void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c index a962f0734dc9..d52ff0b6db88 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c @@ -37,20 +37,20 @@ void test_svst3_s8(svbool_t pg, int8_t *base, svint8x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call 
void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s16u10__SVBool_tPs11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s16(svbool_t pg, int16_t *base, svint16x3_t data) @@ -60,20 +60,20 @@ void test_svst3_s16(svbool_t pg, int16_t *base, svint16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s32u10__SVBool_tPi11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s32(svbool_t pg, int32_t *base, svint32x3_t data) @@ -83,20 +83,20 @@ void test_svst3_s32(svbool_t pg, int32_t *base, svint32x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s64u10__SVBool_tPl11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s64(svbool_t pg, int64_t *base, svint64x3_t data) @@ -127,20 +127,20 @@ void test_svst3_u8(svbool_t pg, uint8_t *base, svuint8x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u16u10__SVBool_tPt12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u16(svbool_t pg, uint16_t *base, svuint16x3_t data) @@ -150,20 +150,20 @@ void test_svst3_u16(svbool_t pg, uint16_t *base, svuint16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define 
{{[^@]+}}@_Z14test_svst3_u32u10__SVBool_tPj12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u32(svbool_t pg, uint32_t *base, svuint32x3_t data) @@ -173,20 +173,20 @@ void test_svst3_u32(svbool_t pg, uint32_t *base, svuint32x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u64u10__SVBool_tPm12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u64(svbool_t pg, uint64_t *base, svuint64x3_t data) @@ -196,20 +196,20 @@ void test_svst3_u64(svbool_t pg, uint64_t *base, svuint64x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f16u10__SVBool_tPDh13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f16(svbool_t pg, float16_t *base, svfloat16x3_t data) @@ -219,20 +219,20 @@ void test_svst3_f16(svbool_t pg, float16_t *base, svfloat16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( 
[[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f32u10__SVBool_tPf13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f32(svbool_t pg, float32_t *base, svfloat32x3_t data) @@ -242,20 +242,20 @@ void test_svst3_f32(svbool_t pg, float32_t *base, svfloat32x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f64u10__SVBool_tPd13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f64(svbool_t pg, float64_t *base, svfloat64x3_t data) @@ -265,20 +265,20 @@ void test_svst3_f64(svbool_t pg, float64_t *base, svfloat64x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_s8u10__SVBool_tPal10svint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x3_t data) @@ -288,22 +288,22 @@ void test_svst3_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x3_t data // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s16u10__SVBool_tPsl11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x3_t data) @@ -313,22 +313,22 @@ void test_svst3_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x3_t d // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = 
getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s32u10__SVBool_tPil11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x3_t data) @@ -338,22 +338,22 @@ void test_svst3_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x3_t d // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s64u10__SVBool_tPll11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) 
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x3_t data) @@ -363,20 +363,20 @@ void test_svst3_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x3_t d // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_u8u10__SVBool_tPhl11svuint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x3_t data) @@ -386,22 +386,22 @@ void test_svst3_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x3_t da // CHECK-LABEL: define 
{{[^@]+}}@test_svst3_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u16u10__SVBool_tPtl12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x3_t data) @@ -411,22 +411,22 @@ void test_svst3_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x3_t // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u32u10__SVBool_tPjl12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x3_t data) @@ -436,22 +436,22 @@ void test_svst3_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x3_t // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u64u10__SVBool_tPml12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x3_t data) @@ -461,22 +461,22 @@ void test_svst3_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x3_t // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f16u10__SVBool_tPDhl13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( 
[[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x3_t data) @@ -486,22 +486,22 @@ void test_svst3_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x3 // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f32u10__SVBool_tPfl13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x3_t data) @@ -511,22 +511,22 @@ void 
test_svst3_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x3 // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f64u10__SVBool_tPdl13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x3_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c index 484fc6333b57..80efdf9df65c 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c @@ -17,22 +17,22 @@ // CHECK-LABEL: define {{[^@]+}}@test_svst4_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) @@ -42,24 +42,24 @@ void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c index 774460c91519..c61e78a941b2 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c @@ -39,22 +39,22 @@ void test_svst4_s8(svbool_t pg, int8_t *base, svint8x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s16u10__SVBool_tPs11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s16(svbool_t pg, int16_t *base, svint16x4_t data) @@ -64,22 +64,22 @@ void test_svst4_s16(svbool_t pg, int16_t *base, svint16x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s32u10__SVBool_tPi11svint32x4_t( // CPP-CHECK-NEXT: entry: -// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s32(svbool_t pg, int32_t *base, svint32x4_t data) @@ -89,22 +89,22 @@ void test_svst4_s32(svbool_t pg, int32_t *base, svint32x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s64u10__SVBool_tPl11svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) 
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s64(svbool_t pg, int64_t *base, svint64x4_t data) @@ -137,22 +137,22 @@ void test_svst4_u8(svbool_t pg, uint8_t *base, svuint8x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u16u10__SVBool_tPt12svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u16(svbool_t pg, uint16_t *base, svuint16x4_t data) @@ -162,22 +162,22 @@ void test_svst4_u16(svbool_t pg, uint16_t *base, svuint16x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u32u10__SVBool_tPj12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u32(svbool_t pg, uint32_t *base, svuint32x4_t data) @@ -187,22 +187,22 @@ void test_svst4_u32(svbool_t pg, uint32_t *base, svuint32x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail 
call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u64u10__SVBool_tPm12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u64(svbool_t pg, uint64_t *base, svuint64x4_t data) @@ -212,22 +212,22 @@ void test_svst4_u64(svbool_t pg, uint64_t *base, svuint64x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f16u10__SVBool_tPDh13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f16(svbool_t pg, float16_t *base, svfloat16x4_t data) @@ -237,22 +237,22 @@ void test_svst4_f16(svbool_t pg, float16_t *base, svfloat16x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f32u10__SVBool_tPf13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f32(svbool_t pg, float32_t *base, svfloat32x4_t data) @@ -262,22 +262,22 @@ void test_svst4_f32(svbool_t pg, float32_t *base, svfloat32x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f64u10__SVBool_tPd13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f64(svbool_t pg, float64_t *base, svfloat64x4_t data) @@ -287,22 +287,22 @@ void test_svst4_f64(svbool_t pg, float64_t *base, svfloat64x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_s8u10__SVBool_tPal10svint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x4_t data) @@ -312,24 +312,24 @@ void test_svst4_vnum_s8(svbool_t pg, 
int8_t *base, int64_t vnum, svint8x4_t data // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s16u10__SVBool_tPsl11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x4_t data) @@ -339,24 +339,24 @@ void test_svst4_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x4_t d // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = 
getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s32u10__SVBool_tPil11svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x4_t data) @@ -366,24 +366,24 @@ void test_svst4_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x4_t d // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// 
CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s64u10__SVBool_tPll11svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x4_t data) @@ -393,22 +393,22 @@ void test_svst4_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x4_t d // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_u8u10__SVBool_tPhl11svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x4_t data) @@ -418,24 +418,24 @@ void test_svst4_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x4_t da // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u16u10__SVBool_tPtl12svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x4_t data) @@ -445,24 +445,24 @@ void test_svst4_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x4_t // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u32u10__SVBool_tPjl12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x4_t data) @@ -472,24 +472,24 @@ void test_svst4_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x4_t // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u64u10__SVBool_tPml12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x4_t data) @@ -499,24 +499,24 @@ void test_svst4_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x4_t // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f16u10__SVBool_tPDhl13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) 
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x4_t data) @@ -526,24 +526,24 @@ void test_svst4_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x4 // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f32u10__SVBool_tPfl13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x4_t data) @@ -553,24 +553,24 @@ void test_svst4_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x4 // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f64u10__SVBool_tPdl13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( 
[[DATA]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x4_t data)
-- 
Gitee

From db571b05965d61c1ee6897a55ab3b56c6bc195fb Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Wed, 1 Nov 2023 16:27:29 +0000
Subject: [PATCH 39/77] [NFC][LLVM][SVE] Refactor predicate register ASM
 constraint parsing to use std::optional.

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 790a36929a0c..70017cc82f06 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9986,14 +9986,15 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   return "r";
 }
 
-enum PredicateConstraint { Uph, Upl, Upa, Invalid };
+enum class PredicateConstraint { Uph, Upl, Upa };
 
-static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
-  return StringSwitch<PredicateConstraint>(Constraint)
+static std::optional<PredicateConstraint>
+parsePredicateConstraint(StringRef Constraint) {
+  return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
       .Case("Uph", PredicateConstraint::Uph)
       .Case("Upl", PredicateConstraint::Upl)
       .Case("Upa", PredicateConstraint::Upa)
-      .Default(PredicateConstraint::Invalid);
+      .Default(std::nullopt);
 }
 
 static const TargetRegisterClass *
@@ -10003,8 +10004,6 @@ getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
     return nullptr;
 
   switch (Constraint) {
-  default:
-    return nullptr;
   case PredicateConstraint::Uph:
     return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
                                      : &AArch64::PPR_p8to15RegClass;
@@ -10014,6 +10013,8 @@
     return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
                                     : &AArch64::PPRRegClass;
   }
+
+  llvm_unreachable("Missing PredicateConstraint!");
 }
 
 // The set of cc code supported is from
 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
@@ -10111,9 +10112,8 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
     case 'S': // A symbolic address
       return C_Other;
     }
-  } else if (parsePredicateConstraint(Constraint) !=
-             PredicateConstraint::Invalid)
-    return C_RegisterClass;
+  } else if (parsePredicateConstraint(Constraint))
+    return C_RegisterClass;
   else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
     return C_Other;
   return TargetLowering::getConstraintType(Constraint);
@@ -10147,7 +10147,7 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
       weight = CW_Constant;
     break;
   case 'U':
-    if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
+    if (parsePredicateConstraint(constraint))
       weight = CW_Register;
     break;
   }
@@ -10204,9 +10204,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
         break;
       }
     } else {
-      PredicateConstraint PC = parsePredicateConstraint(Constraint);
-      if (const TargetRegisterClass *RegClass = getPredicateRegisterClass(PC, VT))
-        return std::make_pair(0U, RegClass);
+      if (const auto PC = parsePredicateConstraint(Constraint))
+        if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
+          return std::make_pair(0U, RegClass);
     }
     if (StringRef("{cc}").equals_insensitive(Constraint) ||
         parseConstraintCode(Constraint) != AArch64CC::Invalid)
-- 
Gitee

From ae2389cebd290cbfd83182a65dad46a0f0af1262 Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Fri, 3 Nov 2023 15:34:45 +0000
Subject: [PATCH 40/77] [LLVM][AArch64] Add ASM constraints for reduced GPR
 register ranges. (#70970)

[LLVM][AArch64] Add ASM constraints for reduced GPR register ranges.

The patch adds the following ASM constraints:
  Uci => w8-w11/x8-x11
  Ucj => w12-w15/x12-x15

These constraints are required for SME load/store instructions where a
reduced set of GPRs are used to specify ZA array vectors.

NOTE: GCC has agreed to use the same constraint syntax.

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 clang/docs/ReleaseNotes.rst                   |  5 ++
 clang/lib/Basic/Targets/AArch64.cpp           |  6 ++
 clang/test/CodeGen/aarch64-inline-asm.c       | 15 ++++
 llvm/docs/LangRef.rst                         |  2 +
 .../Target/AArch64/AArch64ISelLowering.cpp    | 34 +++++++-
 .../AArch64/inlineasm-Uc-constraint.ll        | 78 +++++++++++++++++++
 6 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7b199ffd0397..4f81412bd6f1 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -938,6 +938,11 @@ Arm and AArch64 Support
   and the selected processor lacks floating point registers.
   (`#55755 <https://github.com/llvm/llvm-project/issues/55755>`_)
 
+- New AArch64 asm constraints have been added for r8-r11(Uci) and r12-r15(Ucj).
+
+Android Support
+^^^^^^^^^^^^^^^
+
 - Clang builtin ``__arithmetic_fence`` and the command line option
   ``-fprotect-parens`` are now enabled for AArch64.
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 6d22b65df3da..e64e8bb23d20 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1309,6 +1309,12 @@ bool AArch64TargetInfo::validateAsmConstraint( Name += 2; return true; } + if (Name[1] == 'c' && (Name[2] == 'i' || Name[2] == 'j')) { + // Gpr registers ("Uci"=w8-11, "Ucj"=w12-15) + Info.setAllowsRegister(); + Name += 2; + return true; + } // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes. // Utf: A memory address suitable for ldp/stp in TF mode. // Usa: An absolute symbolic address. diff --git a/clang/test/CodeGen/aarch64-inline-asm.c b/clang/test/CodeGen/aarch64-inline-asm.c index 439fb9e33f9a..75e9a8c46b87 100644 --- a/clang/test/CodeGen/aarch64-inline-asm.c +++ b/clang/test/CodeGen/aarch64-inline-asm.c @@ -80,3 +80,18 @@ void test_tied_earlyclobber(void) { asm("" : "+&r"(a)); // CHECK: call i32 asm "", "=&{x1},0"(i32 %0) } + +void test_reduced_gpr_constraints(int var32, long var64) { + asm("add w0, w0, %0" : : "Uci"(var32) : "w0"); +// CHECK: [[ARG1:%.+]] = load i32, ptr +// CHECK: call void asm sideeffect "add w0, w0, $0", "@3Uci,~{w0}"(i32 [[ARG1]]) + asm("add x0, x0, %0" : : "Uci"(var64) : "x0"); +// CHECK: [[ARG1:%.+]] = load i64, ptr +// CHECK: call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i64 [[ARG1]]) + asm("add w0, w0, %0" : : "Ucj"(var32) : "w0"); +// CHECK: [[ARG2:%.+]] = load i32, ptr +// CHECK: call void asm sideeffect "add w0, w0, $0", "@3Ucj,~{w0}"(i32 [[ARG2]]) + asm("add x0, x0, %0" : : "Ucj"(var64) : "x0"); +// CHECK: [[ARG2:%.+]] = load i64, ptr +// CHECK: call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 [[ARG2]]) +} diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 4fd47bb2bbda..2f4d01c4cddd 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4991,6 +4991,8 @@ AArch64: offsets). (However, LLVM currently does this for the ``m`` constraint as well.) - ``r``: A 32 or 64-bit integer register (W* or X*). +- ``Uci``: Like r, but restricted to registers 8 to 11 inclusive. +- ``Ucj``: Like r, but restricted to registers 12 to 15 inclusive. - ``w``: A 32, 64, or 128-bit floating-point, SIMD or SVE vector register. - ``x``: Like w, but restricted to registers 0 to 15 inclusive. - ``y``: Like w, but restricted to SVE vector registers Z0 to Z7 inclusive. 
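[Editorial note: to make the new constraints concrete, here is a minimal user-level sketch mirroring the clang test above; the function and variable names are illustrative only and are not part of the patch.]

/* Hypothetical example: "Uci" restricts the input operand to w8-w11
 * (x8-x11 for 64-bit values), "Ucj" to w12-w15 (x12-x15), leaving
 * w0/x0 free for use inside the asm body. */
void use_reduced_gpr_constraints(int var32, long var64) {
  asm("add w0, w0, %0" : : "Uci"(var32) : "w0"); /* operand in w8-w11 */
  asm("add x0, x0, %0" : : "Ucj"(var64) : "x0"); /* operand in x12-x15 */
}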
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 70017cc82f06..c0a670d3ded7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10017,6 +10017,31 @@ getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
   llvm_unreachable("Missing PredicateConstraint!");
 }
 
+enum class ReducedGprConstraint { Uci, Ucj };
+
+static std::optional<ReducedGprConstraint>
+parseReducedGprConstraint(StringRef Constraint) {
+  return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
+      .Case("Uci", ReducedGprConstraint::Uci)
+      .Case("Ucj", ReducedGprConstraint::Ucj)
+      .Default(std::nullopt);
+}
+
+static const TargetRegisterClass *
+getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
+  if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
+    return nullptr;
+
+  switch (Constraint) {
+  case ReducedGprConstraint::Uci:
+    return &AArch64::MatrixIndexGPR32_8_11RegClass;
+  case ReducedGprConstraint::Ucj:
+    return &AArch64::MatrixIndexGPR32_12_15RegClass;
+  }
+
+  llvm_unreachable("Missing ReducedGprConstraint!");
+}
+
 // The set of cc code supported is from
 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
@@ -10114,6 +10139,8 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
     }
   } else if (parsePredicateConstraint(Constraint))
     return C_RegisterClass;
+  else if (parseReducedGprConstraint(Constraint))
+    return C_RegisterClass;
   else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
     return C_Other;
   return TargetLowering::getConstraintType(Constraint);
@@ -10147,7 +10174,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
       weight = CW_Constant;
     break;
   case 'U':
-    if (parsePredicateConstraint(constraint))
+    if (parsePredicateConstraint(constraint) ||
+        parseReducedGprConstraint(constraint))
       weight = CW_Register;
     break;
   }
@@ -10207,6 +10235,10 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
     if (const auto PC = parsePredicateConstraint(Constraint))
      if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
       return std::make_pair(0U, RegClass);
+
+    if (const auto RGC = parseReducedGprConstraint(Constraint))
+      if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
+        return std::make_pair(0U, RegClass);
   }
   if (StringRef("{cc}").equals_insensitive(Constraint) ||
       parseConstraintCode(Constraint) != AArch64CC::Invalid)
diff --git a/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll b/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll
new file mode 100644
index 000000000000..0bee7ea40cc1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -o - | FileCheck %s
+
+target triple = "arm64-none-linux-gnu"
+
+define void @test_constraints_Uci_w(i32 %a) {
+; CHECK-LABEL: test_constraints_Uci_w:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i32 %a)
+  ret void
+}
+
+; As test_constraints_Uci_w but ensures non-legal types are also covered.
+define void @test_constraints_Uci_w_i8(i8 %a) {
+; CHECK-LABEL: test_constraints_Uci_w_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i8 %a)
+  ret void
+}
+
+define void @test_constraints_Uci_x(i64 %a) {
+; CHECK-LABEL: test_constraints_Uci_x:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i64 %a)
+  ret void
+}
+
+define void @test_constraint_Ucj_w(i32 %a) {
+; CHECK-LABEL: test_constraint_Ucj_w:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x12
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i32 %a)
+  ret void
+}
+
+; As test_constraint_Ucj_w but ensures non-legal types are also covered.
+define void @test_constraint_Ucj_w_i8(i8 %a) {
+; CHECK-LABEL: test_constraint_Ucj_w_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x12
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i8 %a)
+  ret void
+}
+
+define void @test_constraint_Ucj_x(i64 %a) {
+; CHECK-LABEL: test_constraint_Ucj_x:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x12, x0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x12
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 %a)
+  ret void
+}
--
Gitee

From 231253aeb58500904832ec9941e83d6fb4024ca2 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Tue, 7 Nov 2023 15:42:43 +0000
Subject: [PATCH 41/77] [Clang][SME2] Add multi-vector add/sub builtins (#69725)

Adds the following SME2 builtins:
 - sv(add|sub)
 - sv(add|sub)_za32/za64,
 - sv(add|sub)_write_za32/za64

Other changes in this patch:
- CGBuiltin.cpp: The GetAArch64SMEProcessedOperands function is created
  to avoid duplicating existing code from EmitAArch64SVEBuiltinExpr.
- arm_sve.td: The add/sub SME2 builtins which do not operate on ZA have
  been added to arm_sve.td, matching the corresponding LLVM IR intrinsic
  names which start with @llvm.aarch64.sve for this reason.
- SveEmitter.cpp: Adds the createCoreHeaderIntrinsics function to remove
  duplicated code in createHeader & createSMEHeader. Uses a new enum
  (ACLEKind) to choose either "__builtin_sme_" or "__builtin_sve_" when
  emitting the intrinsics.
See https://github.com/ARM-software/acle/pull/217/files

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 clang/include/clang/Basic/arm_sme.td          |  35 +
 clang/include/clang/Basic/arm_sve.td          |  12 +
 clang/include/clang/Basic/arm_sve_sme_incl.td |   2 +-
 .../aarch64-sme2-intrinsics/acle_sme2_add.c   | 649 ++++++++++++++++++
 .../aarch64-sme2-intrinsics/acle_sme2_sub.c   | 649 ++++++++++++++++++
 .../acle_sme2_vector_add.c                    | 539 +++++++++++++++
 clang/utils/TableGen/SveEmitter.cpp           | 100 +--
 7 files changed, 1935 insertions(+), 51 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 538c717eb253..ebee5a9bcd50 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -263,3 +263,38 @@ multiclass ZAFPOuterProd<string n_suffix> {
 
 defm SVMOPA : ZAFPOuterProd<"mopa">;
 defm SVMOPS : ZAFPOuterProd<"mops">;
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - ADD, SUB

+multiclass ZAAddSub<string n_suffix> {
+  let TargetGuard = "sme2" in {
+    def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+    def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+    let TargetGuard = "sme-i16i64" in {
+      def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+      def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+      def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _ZA64_VG1X4_I64 :
Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+    }
+
+    let TargetGuard = "sme-f64f64" in {
+      def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+      def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+    }
+  }
+}
+
+defm SVADD : ZAAddSub<"add">;
+defm SVSUB : ZAAddSub<"sub">;
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 894a0a1296b0..401811688990 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1868,3 +1868,15 @@ let TargetGuard = "sve2p1" in {
 def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [], []>;
 def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [], []>;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2
+
+// SME intrinsics which operate only on vectors and do not require ZA should be added here,
+// as they could possibly become SVE instructions in the future.
+
+let TargetGuard = "sme2" in {
+// == ADD (vectors) ==
+  def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
+  def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>;
+}
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 74c9b9266771..d97d8ea0fac5 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -254,7 +254,7 @@ class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
 }
 
 class Inst<string n, string p, string t, MergeType mt, string i,
-           list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
+           list<FlagType> ft, list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
   string Name = n;
   string Prototype = p;
   string Types = t;
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
new file mode 100644
index 000000000000..dd96dca70d63
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
@@ -0,0 +1,649 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svadd_write_single2_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s32j11svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svadd_write_single2_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN]], i64 2)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s64j11svint64x2_tu11__SVInt64_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u64j12svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm); +} + +// x4 + +// CHECK-LABEL: @test_svadd_write_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s32j11svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u32j12svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s64j11svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u64j12svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) 
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi-Multi +// + +// x2 + +// CHECK-LABEL: @test_svadd_write_multi2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s32j11svint32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_multi2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u32j12svuint32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_multi2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s64j11svint64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_multi2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u64j12svuint64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm); +} + +// x4 + +// CHECK-LABEL: @test_svadd_write_multi4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// 
CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s32j11svint32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_multi4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u32j12svuint32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_multi4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s64j11svint64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svadd_write_multi4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u64j12svuint64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm); +} + +// +// Accumulate to ZA +// + +// x2 + +// CHECK-LABEL: @test_svadd_za32_vg1x2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_f32j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za32_vg1x2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_s32j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base , zn); +} + +// CHECK-LABEL: @test_svadd_za32_vg1x2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_u32j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za64_vg1x2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_f64j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za64_vg1x2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_s64j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, zn); +} + +// 
CHECK-LABEL: @test_svadd_za64_vg1x2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_u64j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, zn); +} + +// x4 + +// CHECK-LABEL: @test_svadd_za32_vg1x4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_f32j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za32_vg1x4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_s32j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za32_vg1x4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_u32j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za64_vg1x4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_f64j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za64_vg1x4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 
2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_s64j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svadd_za64_vg1x4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_u64j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c new file mode 100644 index 000000000000..9570deab0b39 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c @@ -0,0 +1,649 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single-Multi
+//
+
+// x2
+// CHECK-LABEL: @test_svsub_write_single2_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s32j11svint32x2_tu11__SVInt32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4)
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsub_write_single2_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64>
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s64j11svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm); +} + +// x4 + +// CHECK-LABEL: @test_svsub_write_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s32j11svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_s32(uint32_t slice_base, 
svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u32j12svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s64j11svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u64j12svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi-Multi +// + +// x2 + +// CHECK-LABEL: @test_svsub_write_multi2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s32j11svint32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u32j12svuint32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s64j11svint64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u64j12svuint64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm); +} + +// x4 + +// CHECK-LABEL: @test_svsub_write_multi4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s32j11svint32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], 
[[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u32j12svuint32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s64j11svint64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsub_write_multi4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u64j12svuint64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm); +} + +// +// Accumulate to ZA +// + +// x2 + +// CHECK-LABEL: @test_svsub_za32_vg1x2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_f32j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za { + 
SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_s32j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_u32j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_f64j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL:
@_Z25test_svsub_za64_vg1x2_s64j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x2)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_u64j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, zn); +} + +// x4 + +// CHECK-LABEL: @test_svsub_za32_vg1x4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_f32j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail 
call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_s32j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za32_vg1x4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_u32j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_f64j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_s64j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x4)(slice_base, zn); +} + +// CHECK-LABEL: @test_svsub_za64_vg1x4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_u64j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c new file mode 100644 index 000000000000..85c4b9b09546 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c @@ -0,0 +1,539 @@ +// NOTE: Assertions have 
been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi-Single Vector +// + +// x2 + +// CHECK-LABEL: @test_svadd_vector_single2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s8_x2,,,)(zn, zm); +} +
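The scalable-vector types in these assertions are implied by the intrinsic name manglings (nxv16i8 is <vscale x 16 x i8>, nxv32i8 is <vscale x 32 x i8>, and the x2 intrinsics return a two-element struct of such vectors). As a reading aid only, written out with explicit types for the first function above, the IR shape being matched is schematically the following (argument names and value numbering mirror the [[ZN]]/[[ZM]]/[[TMPn]] captures; the exact signature is an assumption, not additional test content):

    define <vscale x 32 x i8> @test_svadd_vector_single2_s8(<vscale x 32 x i8> %zn, <vscale x 16 x i8> %zm) {
    entry:
      %0 = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> %zn, i64 0)
      %1 = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> %zn, i64 16)
      %2 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %zm)
      %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %2, 0
      %4 = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %3, i64 0)
      %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %2, 1
      %6 = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %4, <vscale x 16 x i8> %5, i64 16)
      ret <vscale x 32 x i8> %6
    }

The same extract, intrinsic, insert pattern repeats for every element type below, and the x4 forms extend it to four parts with correspondingly scaled extract and insert offsets.

+// CHECK-LABEL: @test_svadd_vector_single2_u8( +// CHECK-NEXT: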
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u8_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s16_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], 
i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u16_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s32_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 
4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u32_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s64_x2,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], 
[[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u64_x2,,,)(zn, zm); +} + + +// x4 + +// CHECK-LABEL: @test_svadd_vector_single4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s8_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) __arm_streaming { + return 
SVE_ACLE_FUNC(svadd,_single_u8_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s16_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u16_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s32_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = 
tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u32_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_s64_x4,,,)(zn, zm); +} + +// CHECK-LABEL: @test_svadd_vector_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) __arm_streaming { + return SVE_ACLE_FUNC(svadd,_single_u64_x4,,,)(zn, zm); +} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index f725c3954005..545f2bcc7f62 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -43,6 +43,8 @@ enum ClassKind { ClassG, // Overloaded name without type suffix }; 
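+// Which ACLE extension a header is being emitted for; this decides whether an +// intrinsic's alias points at the __builtin_sve_ or the __builtin_sme_ builtin +// namespace (see the switch in Intrinsic::emitIntrinsic below).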
+enum class ACLEKind { SVE, SME }; + using TypeSpec = std::string; namespace { @@ -236,7 +238,7 @@ public: } /// Emits the intrinsic declaration to the ostream. - void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const; + void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, ACLEKind Kind) const; private: std::string getMergeSuffix() const { return MergeSuffix; } @@ -344,6 +346,10 @@ public: /// Emit arm_sve.h. void createHeader(raw_ostream &o); + // Emits core intrinsics in both arm_sme.h and arm_sve.h + void createCoreHeaderIntrinsics(raw_ostream &o, SVEEmitter &Emitter, + ACLEKind Kind); + /// Emit all the __builtin prototypes and code needed by Sema. void createBuiltins(raw_ostream &o); @@ -987,7 +993,8 @@ std::string Intrinsic::mangleName(ClassKind LocalCK) const { getMergeSuffix(); } -void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const { +void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, + ACLEKind Kind) const { bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1; std::string FullName = mangleName(ClassS); @@ -1004,9 +1011,17 @@ void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const { SMEAttrs += ", arm_preserves_za"; OS << (IsOverloaded ? "__aio " : "__ai ") - << "__attribute__((__clang_arm_builtin_alias(" - << (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_") - << FullName << ")"; + << "__attribute__((__clang_arm_builtin_alias("; + + switch (Kind) { + case ACLEKind::SME: + OS << "__builtin_sme_" << FullName << ")"; + break; + case ACLEKind::SVE: + OS << "__builtin_sve_" << FullName << ")"; + break; + } + if (!SMEAttrs.empty()) OS << SMEAttrs; OS << "))\n"; @@ -1143,6 +1158,34 @@ void SVEEmitter::createIntrinsic( } } +void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS, + SVEEmitter &Emitter, + ACLEKind Kind) { + SmallVector<std::unique_ptr<Intrinsic>, 128> Defs; + std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); + for (auto *R : RV) + createIntrinsic(R, Defs); + + // Sort intrinsics in header file by following order/priority: + // - Architectural guard (i.e. does it require SVE2 or SVE2_AES) + // - Class (is intrinsic overloaded or not) + // - Intrinsic name + std::stable_sort(Defs.begin(), Defs.end(), + [](const std::unique_ptr<Intrinsic> &A, + const std::unique_ptr<Intrinsic> &B) { + auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) { + return std::make_tuple(I->getGuard(), + (unsigned)I->getClassKind(), + I->getName()); + }; + return ToTuple(A) < ToTuple(B); + }); + + // Actually emit the intrinsic declarations. + for (auto &I : Defs) + I->emitIntrinsic(OS, Emitter, Kind); +} + void SVEEmitter::createHeader(raw_ostream &OS) { OS << "/*===---- arm_sve.h - ARM SVE intrinsics " "-----------------------------------===\n" @@ -1294,27 +1337,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) { << To.Suffix << "(__VA_ARGS__)\n"; } - SmallVector<std::unique_ptr<Intrinsic>, 128> Defs; - std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) - createIntrinsic(R, Defs); - - // Sort intrinsics in header file by following order/priority: - // - Architectural guard (i.e. does it require SVE2 or SVE2_AES) - // - Class (is intrinsic overloaded or not) - // - Intrinsic name - std::stable_sort( - Defs.begin(), Defs.end(), [](const std::unique_ptr<Intrinsic> &A, - const std::unique_ptr<Intrinsic> &B) { - auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) { - return std::make_tuple(I->getGuard(), (unsigned)I->getClassKind(), I->getName()); - }; - return ToTuple(A) < ToTuple(B); - }); - - // Actually emit the intrinsic declarations.
- for (auto &I : Defs) - I->emitIntrinsic(OS, *this); + createCoreHeaderIntrinsics(OS, *this, ACLEKind::SVE); OS << "#define svcvtnt_bf16_x svcvtnt_bf16_m\n"; OS << "#define svcvtnt_bf16_f32_x svcvtnt_bf16_f32_m\n"; @@ -1496,30 +1519,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "extern \"C\" {\n"; OS << "#endif\n\n"; - SmallVector<std::unique_ptr<Intrinsic>, 128> Defs; - std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) - createIntrinsic(R, Defs); - - // Sort intrinsics in header file by following order/priority similar to SVE: - // - Architectural guard - // - Class (is intrinsic overloaded or not) - // - Intrinsic name - std::stable_sort(Defs.begin(), Defs.end(), - [](const std::unique_ptr<Intrinsic> &A, - const std::unique_ptr<Intrinsic> &B) { - auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) { - return std::make_tuple(I->getGuard(), - (unsigned)I->getClassKind(), - I->getName()); - }; - return ToTuple(A) < ToTuple(B); - }); - - // Actually emit the intrinsic declaration. - for (auto &I : Defs) { - I->emitIntrinsic(OS, *this); - } + createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME); OS << "#ifdef __cplusplus\n"; OS << "} // extern \"C\"\n"; -- Gitee From 0f09c3142ad80d1518061e490e3bb0b824615591 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 13 Nov 2023 16:01:07 +0000 Subject: [PATCH 42/77] [AArch64] Cast predicate operand of SVE gather loads/scatter stores to the parameter type of the intrinsic (NFC) (#71289) When emitting LLVM IR for gather loads/scatter stores, the predicate parameter is cast to a type that depends on the loaded, resp. stored, type. That's correct for operations where we have a predicate per lane; however, it is not correct for quadword loads and stores (`LD1Q`, `ST1Q`), where the predicate is per 128-bit chunk, independent of the ACLE intrinsic type. This can be handled universally by casting to the corresponding parameter type of the intrinsic. The intrinsic itself should be defined in a way that enforces the relations between parameter types. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> --- clang/lib/CodeGen/CGBuiltin.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d15194ae50e1..af381c20de86 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9126,13 +9126,6 @@ Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags, auto *OverloadedTy = llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy); - // At the ACLE level there's only one predicate type, svbool_t, which is - // mapped to <vscale x 16 x i1>. However, this might be incompatible with the - // actual type being loaded. For example, when loading doubles (i64) the - // predicated should be <vscale x 2 x i1> instead. At the IR level the type of - // the predicate and the data being loaded must match. Cast accordingly. - Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy); - Function *F = nullptr; if (Ops[1]->getType()->isVectorTy()) // This is the "vector base, scalar offset" case. In order to uniquely @@ -9146,6 +9139,16 @@ Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags, // intrinsic. F = CGM.getIntrinsic(IntID, OverloadedTy); + // At the ACLE level there's only one predicate type, svbool_t, which is + // mapped to <vscale x 16 x i1>. However, this might be incompatible with the + // actual type being loaded. For example, when loading doubles (i64) the + // predicate should be <vscale x 2 x i1> instead. 
At the IR level the type of + // the predicate and the data being loaded must match. Cast to the type + // expected by the intrinsic. The intrinsic itself should be defined in + // a way that enforces relations between parameter types. + Ops[0] = EmitSVEPredicateCast( + Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType())); + // Pass 0 when the offset is missing. This can only be applied when using // the "vector base" addressing mode for which ACLE allows no offset. The // corresponding LLVM IR always requires an offset. @@ -9210,8 +9213,11 @@ Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags, // mapped to <vscale x 16 x i1>. However, this might be incompatible with the // actual type being stored. For example, when storing doubles (i64) the // predicated should be <vscale x 2 x i1> instead. At the IR level the type of - // the predicate and the data being stored must match. Cast accordingly. - Ops[1] = EmitSVEPredicateCast(Ops[1], OverloadedTy); + // the predicate and the data being stored must match. Cast to the type + // expected by the intrinsic. The intrinsic itself should be defined in + // a way that enforces relations between parameter types. + Ops[1] = EmitSVEPredicateCast( + Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType())); // For "vector base, scalar index" scale the index so that it becomes a // scalar offset. -- Gitee From 031e316d1ca34151c2b2a4a44b4441644cbe4442 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Mon, 20 Nov 2023 09:57:29 +0000 Subject: [PATCH 43/77] [AArch64][SME] Remove immediate argument restriction for svldr and svstr (#68565) The svldr_vnum and svstr_vnum builtins always modify the base register and tile slice and provide an immediate offset of zero, even when the offset provided to the builtin is an immediate. This patch optimises the output of the builtins when the offset is an immediate, passing it directly to the instruction so that the base register and tile slice updates are no longer needed.
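As a rough illustration of the two cases (a sketch only: the function names here are invented, the draft ACLE header name is assumed, and the streaming/ZA attributes required to call these builtins are elided, as in the C tests below):

    #include <arm_sme_draft_spec_subject_to_change.h>

    // Immediate vnum: with this change it is passed straight through to the
    // instruction's offset field (e.g. "ldr za[w12, 3], [x0, #3, mul vl]"),
    // with no separate rdsvl/madd base update and no tile-slice add.
    void spill_imm(uint32_t slice_base, const void *ptr) {
      svldr_vnum_za(slice_base, ptr, 3);
    }

    // Variable vnum: still lowered with an rdsvl/madd base update and a
    // tile-slice add before the ldr, as before.
    void spill_var(uint32_t slice_base, const void *ptr, int64_t vnum) {
      svldr_vnum_za(slice_base, ptr, vnum);
    }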
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/lib/CodeGen/CGBuiltin.cpp | 16 +- .../aarch64-sme-intrinsics/acle_sme_ldr.c | 28 +-- .../aarch64-sme-intrinsics/acle_sme_str.c | 28 +-- llvm/include/llvm/IR/IntrinsicsAArch64.td | 13 +- .../Target/AArch64/AArch64ISelLowering.cpp | 90 ++++++++ llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 + llvm/lib/Target/AArch64/SMEInstrFormats.td | 23 +- .../CodeGen/AArch64/sme-intrinsics-loads.ll | 196 ++++++++++++++++- .../CodeGen/AArch64/sme-intrinsics-stores.ll | 201 +++++++++++++++++- mlir/test/Target/LLVMIR/arm-sme.mlir | 2 +- 10 files changed, 535 insertions(+), 66 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index af381c20de86..07d6c20118e0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9513,18 +9513,10 @@ Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags, Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { - if (Ops.size() == 3) { - Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); - llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb"); - - llvm::Value *VecNum = Ops[2]; - llvm::Value *MulVL = Builder.CreateMul(CntsbCall, VecNum, "mulvl"); - - Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL); - Ops[0] = Builder.CreateAdd( - Ops[0], Builder.CreateIntCast(VecNum, Int32Ty, true), "tileslice"); - Ops.erase(&Ops[2]); - } + if (Ops.size() == 2) + Ops.push_back(Builder.getInt32(0)); + else + Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true); Function *F = CGM.getIntrinsic(IntID, {}); return Builder.CreateCall(F, Ops); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index 3f8bb6a8cdfe..49f7854d355b 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -8,7 +8,7 @@ // CHECK-C-LABEL: @test_svldr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) { @@ -18,11 +18,7 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) { // CHECK-C-LABEL: @test_svldr_vnum_za_1( // CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]] -// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) // CHECK-NEXT: ret void // void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) { @@ -32,7 +28,7 @@ void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) { // CHECK-C-LABEL: @test_svldr_za( // CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr 
[[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svldr_za(uint32_t slice_base, const void *ptr) { @@ -42,14 +38,20 @@ void test_svldr_za(uint32_t slice_base, const void *ptr) { // CHECK-C-LABEL: @test_svldr_vnum_za_var( // CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvl( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]] -// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM:%.*]] to i32 -// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE:%.*]] -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) // CHECK-NEXT: ret void // void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) { svldr_vnum_za(slice_base, ptr, vnum); } + +// CHECK-C-LABEL: @test_svldr_vnum_za_2( +// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_2jPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) +// CHECK-NEXT: ret void +// +void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) { + svldr_vnum_za(slice_base, ptr, 16); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index 94c95b6664a0..aebc1d56be25 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -8,7 +8,7 @@ // CHECK-C-LABEL: @test_svstr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svstr_vnum_za(uint32_t slice_base, void *ptr) { @@ -18,11 +18,7 @@ void test_svstr_vnum_za(uint32_t slice_base, void *ptr) { // CHECK-C-LABEL: @test_svstr_vnum_za_1( // CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]] -// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) // CHECK-NEXT: ret void // void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { @@ -32,7 +28,7 @@ void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { // CHECK-C-LABEL: @test_svstr_za( // CHECK-CXX-LABEL: @_Z13test_svstr_zajPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svstr_za(uint32_t slice_base, void *ptr) { @@ -42,14 +38,20 @@ void test_svstr_za(uint32_t slice_base, void *ptr) { // CHECK-C-LABEL: @test_svstr_vnum_za_var( // CHECK-CXX-LABEL: @_Z22test_svstr_vnum_za_varjPvl( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 
@llvm.aarch64.sme.cntsb() -// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]] -// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM:%.*]] to i32 -// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE:%.*]] -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) // CHECK-NEXT: ret void // void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) { svstr_vnum_za(slice_base, ptr, vnum); } + +// CHECK-C-LABEL: @test_svstr_vnum_za_2( +// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_2jPv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) +// CHECK-NEXT: ret void +// +void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) { + svstr_vnum_za(slice_base, ptr, 16); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 557063c88132..21d4b9062206 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2679,10 +2679,10 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic; // Spill + fill - def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; - def int_aarch64_sme_str : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; + class SME_LDR_STR_ZA_Intrinsic + : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>; + def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic; + def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic; class SME_TileToVector_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -3438,4 +3438,9 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic; def int_aarch64_sve_sel_x4 : SVE2_VG4_Sel_Intrinsic; + class SME_LDR_STR_ZT_Intrinsic + : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>; + def int_aarch64_sme_ldr_zt : SME_LDR_STR_ZT_Intrinsic; + def int_aarch64_sme_str_zt : SME_LDR_STR_ZT_Intrinsic; + } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c0a670d3ded7..3f4991b05498 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2347,6 +2347,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMPE) + MAKE_CASE(AArch64ISD::SME_ZA_LDR) + MAKE_CASE(AArch64ISD::SME_ZA_STR) MAKE_CASE(AArch64ISD::DUP) MAKE_CASE(AArch64ISD::DUPLANE8) MAKE_CASE(AArch64ISD::DUPLANE16) @@ -4785,6 +4787,90 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, Mask); } +// Lower an SME LDR/STR ZA intrinsic +// Case 1: If the vector number (vecnum) is an immediate in range, it gets +// folded into the instruction +// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11] +// Case 2: If the vecnum is not an immediate, then it is used to modify the base +// and tile slice registers +// ldr(%tileslice, %ptr, %vecnum) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 0], [%ptr2, 0] +// Case 3: If the vecnum is an immediate out of range, then the same is 
done as +// case 2, but the base and slice registers are modified by the greatest +// multiple of 16 lower than or equal to the vecnum, and the remainder is +// folded into the instruction. This means that successive loads and stores +// that are offset from each other can share the same base and slice register +// updates. +// ldr(%tileslice, %ptr, 22) +// ldr(%tileslice, %ptr, 23) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * 16 +// %tileslice2 = %tileslice + 16 +// ldr [%tileslice2, 6], [%ptr2, 6] +// ldr [%tileslice2, 7], [%ptr2, 7] +// Case 4: If the vecnum is an add of an immediate, then the non-immediate +// operand and the immediate can be folded into the instruction, like case 2. +// ldr(%tileslice, %ptr, %vecnum + 7) +// ldr(%tileslice, %ptr, %vecnum + 8) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 7], [%ptr2, 7] +// ldr [%tileslice2, 8], [%ptr2, 8] +// Case 5: The vecnum being an add of an immediate out of range is also handled, +// in which case the same remainder logic as case 3 is used. +SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { + SDLoc DL(N); + + SDValue TileSlice = N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue VecNum = N->getOperand(4); + int32_t ConstAddend = 0; + SDValue VarAddend = VecNum; + + // If the vnum is an add of an immediate, we can fold it into the instruction + if (VecNum.getOpcode() == ISD::ADD && + isa<ConstantSDNode>(VecNum.getOperand(1))) { + ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue(); + VarAddend = VecNum.getOperand(0); + } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) { + ConstAddend = ImmNode->getSExtValue(); + VarAddend = SDValue(); + } + + int32_t ImmAddend = ConstAddend % 16; + if (int32_t C = (ConstAddend - ImmAddend)) { + SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32); + VarAddend = VarAddend + ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal}) + : CVal; + } + + if (VarAddend) { + // Get the vector length that will be multiplied by vnum + auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + + // Multiply SVL and vnum then add it to the base + SDValue Mul = DAG.getNode( + ISD::MUL, DL, MVT::i64, + {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)}); + Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul}); + // Just add vnum to the tileslice + TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend}); + } + + return DAG.getNode(IsLoad ?
AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR, + DL, MVT::Other, + {/*Chain=*/N.getOperand(0), TileSlice, Base, + DAG.getTargetConstant(ImmAddend, DL, MVT::i32)}); +} + SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); @@ -4808,6 +4894,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); } + case Intrinsic::aarch64_sme_str: + case Intrinsic::aarch64_sme_ldr: { + return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr); + } case Intrinsic::aarch64_sme_za_enable: return DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index a6771921eada..5a0b9ea4b9f2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -437,6 +437,10 @@ enum NodeType : unsigned { STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, + // SME ZA loads and stores + SME_ZA_LDR, + SME_ZA_STR, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 823115c7d025..1483c4b3c26d 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -33,6 +33,12 @@ def tileslicerange0s4 : ComplexPattern", []>; def am_sme_indexed_b4 :ComplexPattern", [], [SDNPWantRoot]>; +def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; +def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; +def AArch64SMEStr : SDNode<"AArch64ISD::SME_ZA_STR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; + //===----------------------------------------------------------------------===// // SME Pseudo Classes //===----------------------------------------------------------------------===// @@ -779,23 +785,23 @@ class sme_spill_inst : sme_spill_fill_base<0b1, (outs), (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; let mayLoad = 1 in class sme_fill_inst : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt), (ins MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; multiclass sme_spill { def NAME : sme_spill_inst; def : InstAlias(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; - // base - def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast(NAME) ZA, $idx, 0, $base, 0)>; + + def : Pat<(AArch64SMEStr (i32 MatrixIndexGPR32Op12_15:$slice), (i64 GPR64sp:$base), (i32 sme_elm_idx0_15:$imm)), + (!cast(NAME) ZA, MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base, imm32_0_15:$imm)>; } multiclass sme_fill { @@ -805,16 +811,15 @@ multiclass sme_fill { MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; def NAME # _PSEUDO : Pseudo<(outs), - (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + (ins MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm4, GPR64sp:$base), []>, Sched<[]> { // Translated to actual instruction in AArch64ISelLowering.cpp let usesCustomInserter = 1; let mayLoad = 1; } - // base - def : 
Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast(NAME # _PSEUDO) $idx, 0, $base)>; + def : Pat<(AArch64SMELdr MatrixIndexGPR32Op12_15:$slice, GPR64sp:$base, sme_elm_idx0_15:$imm), + (!cast(NAME # _PSEUDO) MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index ccb3975f0c5b..a286ca4965e1 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -252,7 +252,7 @@ define void @ldr(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: ldr za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i32 0) ret void; } @@ -264,7 +264,7 @@ define void @ldr_with_off_15(ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) ret void; } @@ -278,7 +278,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) ret void; } @@ -292,23 +292,205 @@ define void @ldr_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 16, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 0) ret void; } +define void @ldr_with_off_var(ptr %base, i32 %off) { +; CHECK-LABEL: ldr_with_off_var: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w1, #16 +; CHECK-NEXT: madd x8, x9, x8, x0 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off) + ret void; +} + +define void @ldr_with_off_15imm(ptr %base) { +; CHECK-LABEL: ldr_with_off_15imm: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, #16 // =0x10 +; CHECK-NEXT: ldr za[w12, 15], [x0, #15, mul vl] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 15) + ret void; +} + +define void @ldr_with_off_16imm(ptr %base) { +; CHECK-LABEL: ldr_with_off_16imm: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, #32 // =0x20 +; CHECK-NEXT: add x8, x0, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16) + ret void; +} + +define void @ldr_with_off_many_imm(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: ldr za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} + +define void 
@ldr_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_15_18: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 15], [x1, #15, mul vl] +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 15) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) + ret void +} + +define void @ldr_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_16_19: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 19) + ret void +} + +define void @ldr_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_31_34: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x9, x1, x8, lsl #4 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: ldr za[w12, 15], [x9, #15, mul vl] +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 31) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) + ret void +} + +define void @ldr_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_imm_32_35: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 35) + ret void +} + +define void @ldr_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; 
CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + +define void @ldr_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_var_high: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w2, #32 +; CHECK-NEXT: rdsvl x10, #1 +; CHECK-NEXT: sxtw x9, w8 +; CHECK-NEXT: add w12, w0, w8 +; CHECK-NEXT: madd x9, x10, x9, x1 +; CHECK-NEXT: ldr za[w12, 1], [x9, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x9, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x9, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + %1 = add i32 %0, 33 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 34 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 35 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + %4 = add i32 %0, 36 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %4) + ret void +} + ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. define void @test_ld1_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) { ; CHECK-LABEL: test_ld1_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB14_1: // %for.body +; CHECK-NEXT: .LBB24_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0] ; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0] -; CHECK-NEXT: b.ne .LBB14_1 +; CHECK-NEXT: b.ne .LBB24_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -341,5 +523,5 @@ declare void @llvm.aarch64.sme.ld1w.vert(, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1d.vert(, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1q.vert(, ptr, i32, i32) -declare void @llvm.aarch64.sme.ldr(i32, ptr) +declare void @llvm.aarch64.sme.ldr(i32, ptr, i32) declare i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll index ddff4c7d3cd3..36d72b7a7abb 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -252,7 +252,7 @@ define void @str(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: str za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.str(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i32 0) ret void; } @@ -264,7 +264,7 @@ define void @str_with_off_15(ptr %ptr) { ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.str(i32 15, ptr %base) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) ret void; } @@ -278,7 +278,7 @@ define void @str_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 15, ptr %base) + call void 
@llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) ret void; } @@ -292,23 +292,210 @@ define void @str_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 16, ptr %base) + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 0) ret void; } +define void @str_with_off_var(ptr %base, i32 %off) { +; CHECK-LABEL: str_with_off_var: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w1, #16 +; CHECK-NEXT: madd x8, x9, x8, x0 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off) + ret void; +} + +define void @str_with_off_15imm(ptr %ptr) { +; CHECK-LABEL: str_with_off_15imm: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, #15 // =0xf +; CHECK-NEXT: add x8, x0, #15 +; CHECK-NEXT: str za[w12, 15], [x8, #15, mul vl] +; CHECK-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 15 + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 15) + ret void; +} + +define void @str_with_off_16imm(ptr %ptr) { +; CHECK-LABEL: str_with_off_16imm: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, #31 // =0x1f +; CHECK-NEXT: add x8, x0, x8, lsl #4 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 15 + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 16) + ret void; +} + +define void @str_with_off_many_imm(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: str za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} + +define void @str_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_15_18: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: str za[w12, 15], [x1, #15, mul vl] +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 15) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) + ret void +} + +define void @str_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_16_19: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, 
i32 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 19) + ret void +} + +define void @str_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_31_34: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add w13, w0, #32 +; CHECK-NEXT: add x9, x1, x8, lsl #4 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: str za[w12, 15], [x9, #15, mul vl] +; CHECK-NEXT: str za[w13, 0], [x8] +; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 31) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) + ret void +} + +define void @str_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_32_35: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 35) + ret void +} + +define void @str_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + +define void @str_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_var_high: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w2, #32 +; CHECK-NEXT: rdsvl x10, #1 +; CHECK-NEXT: sxtw x9, w8 +; CHECK-NEXT: add w12, w0, w8 +; CHECK-NEXT: madd x9, x10, x9, x1 +; CHECK-NEXT: str za[w12, 1], [x9, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x9, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x9, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + %1 = add i32 %0, 33 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 34 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 35 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr 
%ptr, i32 %3)
+ %4 = add i32 %0, 36
+ tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %4)
+ ret void
+}
+
+
 ; Ensure that the tile offset is sunk, given that this is likely to be an 'add'
 ; that's decomposed into a base + offset in ISel.
 define void @test_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) {
 ; CHECK-LABEL: test_sink_tile0_offset_operand:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: mov w12, w1
-; CHECK-NEXT: .LBB14_1: // %for.body
+; CHECK-NEXT: .LBB24_1: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0]
 ; CHECK-NEXT: subs w2, w2, #1
 ; CHECK-NEXT: st1w {za0h.s[w12, 1]}, p0, [x0]
 ; CHECK-NEXT: st1w {za0h.s[w12, 2]}, p0, [x0]
-; CHECK-NEXT: b.ne .LBB14_1
+; CHECK-NEXT: b.ne .LBB24_1
 ; CHECK-NEXT: // %bb.2: // %exit
 ; CHECK-NEXT: ret
 entry:
@@ -340,5 +527,5 @@
 declare void @llvm.aarch64.sme.st1w.vert(, ptr, i32, i32)
 declare void @llvm.aarch64.sme.st1d.vert(, ptr, i32, i32)
 declare void @llvm.aarch64.sme.st1q.vert(, ptr, i32, i32)
-declare void @llvm.aarch64.sme.str(i32, ptr)
+declare void @llvm.aarch64.sme.str(i32, ptr, i32)
 declare i64 @llvm.vscale.i64()

diff --git a/mlir/test/Target/LLVMIR/arm-sme.mlir b/mlir/test/Target/LLVMIR/arm-sme.mlir
index 7beec1f61aa9..ebda250bb302 100644
--- a/mlir/test/Target/LLVMIR/arm-sme.mlir
+++ b/mlir/test/Target/LLVMIR/arm-sme.mlir
@@ -222,7 +222,7 @@ llvm.func @arm_sme_store(%nxv1i1 : vector<[1]xi1>,
   "arm_sme.intr.st1b.vert"(%nxv16i1, %p8, %c0, %c0) : (vector<[16]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.str
-  "arm_sme.intr.str"(%c0, %p8) : (i32, !llvm.ptr) -> ()
+  "arm_sme.intr.str"(%c0, %ptr, %c0) : (i32, !llvm.ptr, i32) -> ()
   llvm.return
 }
--
Gitee

From d334dd60a7513e6749808e62c37de8a18a9de3f2 Mon Sep 17 00:00:00 2001
From: Sam Tebbs
Date: Thu, 30 Nov 2023 14:58:34 +0000
Subject: [PATCH 44/77] [AArch64] Warn when calling a NEON builtin in a streaming function (#73672)

This patch introduces a warning that is emitted when a Neon builtin is
called from a streaming function, as that situation is not supported.

Uses work by Kerry McLaughlin.
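For illustration, a minimal sketch of the pattern this diagnoses
(mirroring the Sema test added below; the function name here is a
placeholder):

  #include "arm_neon.h"

  // vqaddq_s16 expands to a Neon builtin, which has undefined behaviour
  // in streaming mode, so clang now warns: "builtin call has undefined
  // behaviour when called from a streaming function".
  int16x8_t add_sat(int16x8_t v) __arm_streaming {
    return vqaddq_s16(v, v);
  }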
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../clang/Basic/DiagnosticSemaKinds.td | 3 ++ clang/lib/Sema/SemaChecking.cpp | 49 +++++++++++++++++++ .../Sema/aarch64-incompat-sm-builtin-calls.c | 22 +++++++++ 3 files changed, 74 insertions(+) create mode 100644 clang/test/Sema/aarch64-incompat-sm-builtin-calls.c diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a1f72cf14cbe..f30532464aa4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3101,6 +3101,9 @@ def err_attribute_bad_sve_vector_size : Error< def err_attribute_arm_feature_sve_bits_unsupported : Error< "%0 is only supported when '-msve-vector-bits=' is specified with a " "value of 128, 256, 512, 1024 or 2048.">; +def warn_attribute_arm_sm_incompat_builtin : Warning< + "builtin call has undefined behaviour when called from a %0 function">, + InGroup>; def err_sve_vector_in_non_sve_target : Error< "SVE vector type %0 cannot be used in a target without sve">; def err_attribute_riscv_rvv_bits_unsupported : Error< diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index d9fe73ddfb71..b54f36799986 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2893,6 +2893,38 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context, llvm_unreachable("Invalid NeonTypeFlag!"); } +enum ArmStreamingType { ArmNonStreaming, ArmStreaming, ArmStreamingCompatible }; + +static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) { + if (FD->hasAttr()) + return ArmStreaming; + if (const auto *T = FD->getType()->getAs()) { + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + return ArmStreaming; + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + return ArmStreamingCompatible; + } + return ArmNonStreaming; +} + +static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, + const FunctionDecl *FD, + ArmStreamingType BuiltinType) { + ArmStreamingType FnType = getArmStreamingFnType(FD); + + if (FnType == ArmStreaming && BuiltinType == ArmNonStreaming) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming"; + } + + if (FnType == ArmStreamingCompatible && + BuiltinType != ArmStreamingCompatible) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming compatible"; + return; + } +} + bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { // Range check SVE intrinsics that take immediate values. SmallVector, 3> ImmChecks; @@ -3031,6 +3063,23 @@ bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { bool Sema::CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + + switch (BuiltinID) { + default: + break; +#define GET_NEON_BUILTINS +#define TARGET_BUILTIN(id, ...) case NEON::BI##id: +#define BUILTIN(id, ...) 
case NEON::BI##id:
+#include "clang/Basic/arm_neon.inc"
+    checkArmStreamingBuiltin(*this, TheCall, FD, ArmNonStreaming);
+    break;
+#undef TARGET_BUILTIN
+#undef BUILTIN
+#undef GET_NEON_BUILTINS
+  }
+  }
+
 llvm::APSInt Result;
 uint64_t mask = 0;
 unsigned TV = 0;

diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
new file mode 100644
index 000000000000..e77e09c44351
--- /dev/null
+++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
@@ -0,0 +1,22 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
+// RUN: -target-feature +sme -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include "arm_neon.h"
+
+int16x8_t incompat_neon_sm(int16x8_t splat) __arm_streaming {
+  // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}}
+  return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
+}
+
+__arm_locally_streaming int16x8_t incompat_neon_ls(int16x8_t splat) {
+  // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}}
+  return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
+}
+
+int16x8_t incompat_neon_smc(int16x8_t splat) __arm_streaming_compatible {
+  // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}}
+  return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
+}
--
Gitee

From 58443170efdb7002e638561e02cb30fd76fe6e71 Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 1 Dec 2023 07:34:22 -0800
Subject: [PATCH 45/77] [AArch64][SME] Remove implicit-def's on smstart (#69012)

When we lower calls, the sequence of argument copy-to-reg nodes is glued
to the smstart. In the InstrEmitter, these glued copies are turned into
implicit defs, since the actual call instruction uses those physregs,
resulting in the register allocator adding unnecessary copies of regs
that are preserved anyway.

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 16 ++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 ++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  4 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  1 +
 .../sme-streaming-compatible-interface.ll     | 52 +++++++++++++++++++
 .../AArch64/sme-streaming-interface.ll        | 41 ++++++++++++---
 6 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3f4991b05498..c814096a304f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7198,6 +7198,22 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
   return ZExtBool;
 }

+void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                          SDNode *Node) const {
+  // Live-in physreg copies that are glued to SMSTART are applied as
+  // implicit-def's in the InstrEmitter. Here we remove them, allowing the
+  // register allocator to pass call args in callee saved regs, without extra
+  // copies to avoid these fake clobbers of actually-preserved GPRs.
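+  //
+  // Note: the operands are walked from the back so that removeOperand()
+  // does not shift the index of any operand we have not visited yet.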
+ if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 || + MI.getOpcode() == AArch64::MSRpstatePseudo) + for (unsigned I = MI.getNumOperands() - 1; I > 0; --I) + if (MachineOperand &MO = MI.getOperand(I); + MO.isReg() && MO.isImplicit() && MO.isDef() && + (AArch64::GPR32RegClass.contains(MO.getReg()) || + AArch64::GPR64RegClass.contains(MO.getReg()))) + MI.removeOperand(I); +} + SDValue AArch64TargetLowering::changeStreamingMode( SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 5a0b9ea4b9f2..fc682484de54 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -960,6 +960,9 @@ private: const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; + void AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const override; + SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl &InVals) const override; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 2685f2e3c810..daaf040656f3 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -203,7 +203,9 @@ def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), def MSRpstatePseudo : Pseudo<(outs), (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>, - Sched<[WriteSys]>; + Sched<[WriteSys]> { + let hasPostISelHook = 1; +} def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)), (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 1483c4b3c26d..835e74fdbc64 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -221,6 +221,7 @@ def MSRpstatesvcrImm1 let Inst{11-9} = pstatefield; let Inst{8} = imm; let Inst{7-5} = 0b011; // op2 + let hasPostISelHook = 1; } def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index e247de284f6d..7d7f6af8a641 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -437,5 +437,57 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind { ret void; } +define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: call_to_non_streaming_pass_args: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; 
CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x8, x1 +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz w19, #0, .LBB10_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB10_2: // %entry +; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: mov x0, x9 +; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov x1, x8 +; CHECK-NEXT: bl bar +; CHECK-NEXT: tbz w19, #0, .LBB10_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB10_4: // %entry +; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ret +entry: + call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) + ret void +} + +declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef) attributes #0 = { nounwind "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index a338db9f8138..9de45c649caf 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -368,15 +368,11 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: rdsvl x3, #1 +; CHECK-NEXT: addvl x0, sp, #2 +; CHECK-NEXT: addvl x1, sp, #1 +; CHECK-NEXT: mov x2, sp ; CHECK-NEXT: smstop sm -; CHECK-NEXT: mov x0, x9 -; CHECK-NEXT: mov x1, x10 -; CHECK-NEXT: mov x2, x11 -; CHECK-NEXT: mov x3, x8 ; CHECK-NEXT: bl foo ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ptrue p0.b @@ -400,9 +396,38 @@ entry: ret i8 %vecext } +define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 { +; CHECK-LABEL: call_to_non_streaming_pass_args: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: bl bar +; 
CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ret +entry: + call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) + ret void +} + declare i64 @llvm.aarch64.sme.cntsb() declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef) +declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef) attributes #0 = { nounwind "target-features"="+sve" } attributes #1 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" } -- Gitee From a2d0e18e306149aebadae48bca9e15839d97bbcf Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 5 Dec 2023 13:20:57 +0000 Subject: [PATCH 46/77] [clang][NFC] Replace ARM_STREAMING_ATTR in tests with SME streaming attribute Some tests were testing SME builtins before the streaming attributes existed, and so either inserted them or not depending on a macro definition. The intention was for the macro to be defined once the attributes were added to clang, but it never was defined. This change removes the macro logic and adds the attribute now that they have been added to clang. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../aarch64-sme-intrinsics/acle_sme_ld1.c | 32 ++++++++----------- .../acle_sme_ld1_vnum.c | 32 ++++++++----------- .../aarch64-sme-intrinsics/acle_sme_st1.c | 32 ++++++++----------- .../acle_sme_st1_vnum.c | 32 ++++++++----------- 4 files changed, 52 insertions(+), 76 deletions(-) diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index 57ed46995500..ae972731e6e9 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -1,16 +1,10 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s #include -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif - // CHECK-C-LABEL: @test_svld1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKv( // 
CHECK-NEXT: entry: @@ -19,7 +13,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_hor_za8(0, slice_base, pg, ptr); svld1_hor_za8(0, slice_base + 15, pg, ptr); } @@ -33,7 +27,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, con // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_hor_za16(0, slice_base, pg, ptr); svld1_hor_za16(1, slice_base + 7, pg, ptr); } @@ -47,7 +41,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_hor_za32(0, slice_base, pg, ptr); svld1_hor_za32(3, slice_base + 3, pg, ptr); } @@ -61,7 +55,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_hor_za64(0, slice_base, pg, ptr); svld1_hor_za64(7, slice_base + 1, pg, ptr); } @@ -74,7 +68,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_hor_za128(0, slice_base, pg, ptr); svld1_hor_za128(15, slice_base, pg, ptr); } @@ -87,7 +81,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, c // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_ver_za8(0, slice_base, pg, ptr); svld1_ver_za8(0, slice_base + 15, pg, ptr); } @@ -101,7 +95,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, con // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_ver_za16(0, slice_base, pg, ptr); svld1_ver_za16(1, 
slice_base + 7, pg, ptr); } @@ -115,7 +109,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_ver_za32(0, slice_base, pg, ptr); svld1_ver_za32(3, slice_base + 3, pg, ptr); } @@ -129,7 +123,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_ver_za64(0, slice_base, pg, ptr); svld1_ver_za64(7, slice_base + 1, pg, ptr); } @@ -142,7 +136,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { +void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming { svld1_ver_za128(0, slice_base, pg, ptr); svld1_ver_za128(15, slice_base, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 5d61587d8557..84011615636e 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -1,16 +1,10 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s #include -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif - // CHECK-C-LABEL: @test_svld1_hor_vnum_za8( // CHECK-CXX-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl( // CHECK-NEXT: entry: @@ -22,7 +16,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { 
+void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_hor_vnum_za8(0, slice_base, pg, ptr, vnum); svld1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } @@ -39,7 +33,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_hor_vnum_za16(0, slice_base, pg, ptr, vnum); svld1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } @@ -56,7 +50,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_hor_vnum_za32(0, slice_base, pg, ptr, vnum); svld1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } @@ -73,7 +67,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_hor_vnum_za64(0, slice_base, pg, ptr, vnum); svld1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } @@ -89,7 +83,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_hor_vnum_za128(0, slice_base, pg, ptr, vnum); svld1_hor_vnum_za128(15, slice_base, pg, ptr, vnum); } @@ -105,7 +99,7 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_ver_vnum_za8(0, slice_base, pg, ptr, vnum); svld1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } @@ -122,7 +116,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, 
const void *ptr, int64_t vnum) __arm_streaming { svld1_ver_vnum_za16(0, slice_base, pg, ptr, vnum); svld1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } @@ -139,7 +133,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_ver_vnum_za32(0, slice_base, pg, ptr, vnum); svld1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } @@ -156,7 +150,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_ver_vnum_za64(0, slice_base, pg, ptr, vnum); svld1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } @@ -172,7 +166,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming { svld1_ver_vnum_za128(0, slice_base, pg, ptr, vnum); svld1_ver_vnum_za128(15, slice_base, pg, ptr, vnum); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index eec542341670..31708906f8c0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -1,16 +1,10 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s #include -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif - // CHECK-C-LABEL: @test_svst1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPv( // CHECK-NEXT: entry: 
@@ -19,7 +13,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_hor_za8(0, slice_base, pg, ptr); svst1_hor_za8(0, slice_base + 15, pg, ptr); } @@ -33,7 +27,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, voi // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_hor_za16(0, slice_base, pg, ptr); svst1_hor_za16(1, slice_base + 7, pg, ptr); } @@ -47,7 +41,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_hor_za32(0, slice_base, pg, ptr); svst1_hor_za32(3, slice_base + 3, pg, ptr); } @@ -61,7 +55,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_hor_za64(0, slice_base, pg, ptr); svst1_hor_za64(7, slice_base + 1, pg, ptr); } @@ -74,7 +68,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_hor_za128(0, slice_base, pg, ptr); svst1_hor_za128(15, slice_base, pg, ptr); } @@ -87,7 +81,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, v // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_ver_za8(0, slice_base, pg, ptr); svst1_ver_za8(0, slice_base + 15, pg, ptr); } @@ -101,7 +95,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, voi // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_ver_za16(0, slice_base, pg, ptr); svst1_ver_za16(1, slice_base + 7, pg, ptr); } @@ -115,7 +109,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_za16(uint32_t 
slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_ver_za32(0, slice_base, pg, ptr); svst1_ver_za32(3, slice_base + 3, pg, ptr); } @@ -129,7 +123,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_ver_za64(0, slice_base, pg, ptr); svst1_ver_za64(7, slice_base + 1, pg, ptr); } @@ -142,7 +136,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) { +void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { svst1_ver_za128(0, slice_base, pg, ptr); svst1_ver_za128(15, slice_base, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c index 81a2bba953b8..e6884739f6ba 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -1,16 +1,10 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s #include -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif - // CHECK-C-LABEL: @test_svst1_hor_vnum_za8( // CHECK-CXX-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl( // CHECK-NEXT: entry: @@ -22,7 +16,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_hor_vnum_za8(0, slice_base, pg, 
ptr, vnum); svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } @@ -39,7 +33,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_hor_vnum_za16(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } @@ -56,7 +50,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_hor_vnum_za32(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } @@ -73,7 +67,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_hor_vnum_za64(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } @@ -89,7 +83,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_hor_vnum_za128(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za128(15, slice_base, pg, ptr, vnum); } @@ -105,7 +99,7 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_ver_vnum_za8(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } @@ -122,7 +116,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_ver_vnum_za16(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } @@ -139,7 +133,7 @@ ARM_STREAMING_ATTR void 
test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_ver_vnum_za32(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } @@ -156,7 +150,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_ver_vnum_za64(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } @@ -172,7 +166,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { svst1_ver_vnum_za128(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za128(15, slice_base, pg, ptr, vnum); } -- Gitee From 489af90e671d8f7bbb310539d24b54865cd03fd4 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 6 Dec 2023 13:39:15 +0000 Subject: [PATCH 47/77] [Clang][NFC] Refactor out code from CheckSVEBuiltinFunctionCall into ParseSVEImmChecks This moves code from CheckSVEBuiltinFunctionCall into ParseSVEImmChecks in preparation for #74064 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Sema/Sema.h | 2 + clang/lib/Sema/SemaChecking.cpp | 97 +++++++++++++++++---------------- 2 files changed, 53 insertions(+), 46 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 588fdb15b2a0..d11c44bd45a2 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13614,6 +13614,8 @@ private: CallExpr *TheCall); bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool ParseSVEImmChecks(CallExpr *TheCall, + SmallVector, 3> &ImmChecks); bool CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckARMCoprocessorImmediate(const TargetInfo &TI, const Expr *CoprocArg, diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index b54f36799986..af148d703413 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2895,58 +2895,15 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context, enum ArmStreamingType { ArmNonStreaming, ArmStreaming, ArmStreamingCompatible }; -static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) { - if (FD->hasAttr()) - return ArmStreaming; - if (const auto *T = FD->getType()->getAs()) { - if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) - return ArmStreaming; - 
if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) - return ArmStreamingCompatible; - } - return ArmNonStreaming; -} - -static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, - const FunctionDecl *FD, - ArmStreamingType BuiltinType) { - ArmStreamingType FnType = getArmStreamingFnType(FD); - - if (FnType == ArmStreaming && BuiltinType == ArmNonStreaming) { - S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) - << TheCall->getSourceRange() << "streaming"; - } - - if (FnType == ArmStreamingCompatible && - BuiltinType != ArmStreamingCompatible) { - S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) - << TheCall->getSourceRange() << "streaming compatible"; - return; - } -} - -bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { - // Range check SVE intrinsics that take immediate values. - SmallVector, 3> ImmChecks; - - switch (BuiltinID) { - default: - return false; -#define GET_SVE_IMMEDIATE_CHECK -#include "clang/Basic/arm_sve_sema_rangechecks.inc" -#undef GET_SVE_IMMEDIATE_CHECK -#define GET_SME_IMMEDIATE_CHECK -#include "clang/Basic/arm_sme_sema_rangechecks.inc" -#undef GET_SME_IMMEDIATE_CHECK - } - +bool Sema::ParseSVEImmChecks( + CallExpr *TheCall, SmallVector, 3> &ImmChecks) { // Perform all the immediate checks for this builtin call. bool HasError = false; for (auto &I : ImmChecks) { int ArgNum, CheckTy, ElementSizeInBits; std::tie(ArgNum, CheckTy, ElementSizeInBits) = I; - typedef bool(*OptionSetCheckFnTy)(int64_t Value); + typedef bool (*OptionSetCheckFnTy)(int64_t Value); // Function that checks whether the operand (ArgNum) is an immediate // that is one of the predefined values. @@ -3061,6 +3018,54 @@ bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return HasError; } +static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) { + if (FD->hasAttr()) + return ArmStreaming; + if (const auto *T = FD->getType()->getAs()) { + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + return ArmStreaming; + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + return ArmStreamingCompatible; + } + return ArmNonStreaming; +} + +static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, + const FunctionDecl *FD, + ArmStreamingType BuiltinType) { + ArmStreamingType FnType = getArmStreamingFnType(FD); + + if (FnType == ArmStreaming && BuiltinType == ArmNonStreaming) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming"; + } + + if (FnType == ArmStreamingCompatible && + BuiltinType != ArmStreamingCompatible) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming compatible"; + return; + } +} + +bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + // Range check SVE intrinsics that take immediate values. 
+ SmallVector, 3> ImmChecks; + + switch (BuiltinID) { + default: + return false; +#define GET_SVE_IMMEDIATE_CHECK +#include "clang/Basic/arm_sve_sema_rangechecks.inc" +#undef GET_SVE_IMMEDIATE_CHECK +#define GET_SME_IMMEDIATE_CHECK +#include "clang/Basic/arm_sme_sema_rangechecks.inc" +#undef GET_SME_IMMEDIATE_CHECK + } + + return ParseSVEImmChecks(TheCall, ImmChecks); +} + bool Sema::CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall) { if (const FunctionDecl *FD = getCurFunctionDecl()) { -- Gitee From 78b889c2172b0358c71bb0b0a490f808b36ce6b3 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 26 Jul 2023 06:19:54 +0000 Subject: [PATCH 48/77] [Clang][AArch64] Fix up immediate range of f64f64 mopa/mops intrinsics Reviewed By: bryanpkc Differential Revision: https://reviews.llvm.org/D156128 Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/arm_sme.td | 2 +- .../aarch64-sme-intrinsics/acle_sme_mopa-za64.c | 12 ++++++------ .../aarch64-sme-intrinsics/acle_sme_mops-za64.c | 12 ++++++------ .../Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp | 5 +++++ 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index ebee5a9bcd50..b5655afdf419 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -257,7 +257,7 @@ multiclass ZAFPOuterProd { def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d", MergeOp1, "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], - [ImmCheck<0, ImmCheck0_3>]>; + [ImmCheck<0, ImmCheck0_7>]>; } } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c index 6925a450ba38..835d7c75ba6e 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -18,11 +18,11 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { - SME_ACLE_FUNC(svmopa_za64, _s16, _m)(1, pn, pm, zn, zm); + SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm); } // CHECK-C-LABEL: @test_svmopa_za64_u16( @@ -42,11 +42,11 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { - SME_ACLE_FUNC(svmopa_za64, _f64, _m)(1, pn, pm, zn, zm); + SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm); } // CHECK-C-LABEL: 
 clang/include/clang/Basic/arm_sme.td              |  2 +-
 .../aarch64-sme-intrinsics/acle_sme_mopa-za64.c   | 12 ++++++------
 .../aarch64-sme-intrinsics/acle_sme_mops-za64.c   | 12 ++++++------
 .../Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp  |  5 +++++
 4 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index ebee5a9bcd50..b5655afdf419 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -257,7 +257,7 @@ multiclass ZAFPOuterProd {
   def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d",
                             MergeOp1, "aarch64_sme_" # n_suffix,
                             [IsStreaming, IsSharedZA],
-                            [ImmCheck<0, ImmCheck0_3>]>;
+                            [ImmCheck<0, ImmCheck0_7>]>;
   }
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index 6925a450ba38..835d7c75ba6e 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -18,11 +18,11 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CHECK-NEXT:    ret void
 //
 void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
-  SME_ACLE_FUNC(svmopa_za64, _s16, _m)(1, pn, pm, zn, zm);
+  SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
 // CHECK-C-LABEL: @test_svmopa_za64_u16(
@@ -42,11 +42,11 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PN:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PM:%.*]])
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
 // CHECK-NEXT:    ret void
 //
 void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
-  SME_ACLE_FUNC(svmopa_za64, _f64, _m)(1, pn, pm, zn, zm);
+  SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
 // CHECK-C-LABEL: @test_svsumopa_za64_s16(
@@ -66,9 +66,9 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 2, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CHECK-NEXT:    ret void
 //
 void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) {
-  SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(2, pn, pm, zn, zm);
+  SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm);
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index 447bca055ad4..ea1e55001b65 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -18,11 +18,11 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CHECK-NEXT:    ret void
 //
 void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
-  SME_ACLE_FUNC(svmops_za64, _s16, _m)(1, pn, pm, zn, zm);
+  SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
 // CHECK-C-LABEL: @test_svmops_za64_u16(
@@ -42,11 +42,11 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PN:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PM:%.*]])
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
 // CHECK-NEXT:    ret void
 //
 void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
-  SME_ACLE_FUNC(svmops_za64, _f64, _m)(1, pn, pm, zn, zm);
+  SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
 // CHECK-C-LABEL: @test_svsumops_za64_s16(
@@ -66,9 +66,9 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN:%.*]])
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM:%.*]])
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 2, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CHECK-NEXT:    ret void
 //
 void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) {
-  SME_ACLE_FUNC(svusmops_za64, _u16, _m)(2, pn, pm, zn, zm);
+  SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm);
 }

diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
index 68395a39b878..1faa5638c801 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
@@ -126,6 +126,11 @@ void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) {
   SVE_ACLE_FUNC(svusmopa_za64, _u16, _m,)(8, pg, pg, svundef_u16(), svundef_s16());
   // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
   SVE_ACLE_FUNC(svusmops_za64, _u16, _m,)(-1, pg, pg, svundef_u16(), svundef_s16());
+
+  // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}}
+  SVE_ACLE_FUNC(svmopa_za64, _f64, _m,)(8, pg, pg, svundef_f64(), svundef_f64());
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  SVE_ACLE_FUNC(svmops_za64, _f64, _m,)(-1, pg, pg, svundef_f64(), svundef_f64());
 }
 
 void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) {
-- 
Gitee

From a0a88eda4ddbef8b1027a280540f739d6489d671 Mon Sep 17 00:00:00 2001
From: Sam Tebbs
Date: Mon, 18 Dec 2023 09:32:34 +0000
Subject: [PATCH 49/77] [AArch64][SME] Warn when using a streaming builtin from a non-streaming function (#75487)

This PR adds a warning that is emitted when a non-streaming or
non-streaming-compatible builtin is called from an unsuitable function.

Uses work by Kerry McLaughlin.

This is a re-upload of #74064 and fixes a compile-time increase.

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
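For context, a minimal sketch of the new diagnostic, assuming an SME-enabled
toolchain; the warning text below is an approximation of the new clang
diagnostic, not a quote from this patch:

  // Sketch only; header may be arm_sme_draft_spec_subject_to_change.h on
  // toolchains of this vintage.
  #include <arm_sme.h>

  // svmopa_za32_f32_m requires streaming mode (IsStreaming in arm_sme.td),
  // but the caller is a plain non-streaming function, so clang now warns
  // along the lines of: "builtin call has undefined behaviour when called
  // from a non-streaming function".
  void bad(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) {
    svmopa_za32_f32_m(0, pn, pm, zn, zm);
  }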
 clang/include/clang/Basic/CMakeLists.txt          |    6 +
 clang/include/clang/Basic/arm_sve.td              | 1164 ++++++++--------
 clang/include/clang/Sema/Sema.h                   |    1 +
 clang/lib/Sema/SemaChecking.cpp                   |   51 +-
 .../aarch64-sme-intrinsics/acle_sme_add-i32.c     |   16 +-
 .../aarch64-sme-intrinsics/acle_sme_add-i64.c     |   16 +-
 .../acle_sme_mopa-za32.c                          |   14 +-
 .../acle_sme_mopa-za64.c                          |   10 +-
 .../acle_sme_mops-za32.c                          |   14 +-
 .../acle_sme_mops-za64.c                          |   10 +-
 .../aarch64-sme-intrinsics/acle_sme_read.c        |  192 +--
 .../aarch64-sme-intrinsics/acle_sme_write.c       |  192 +--
 .../Sema/aarch64-incompat-sm-builtin-calls.c      |   77 ++
 .../aarch64-sme-intrinsics/acle_sme_imm.cpp       |   14 +-
 .../aarch64-sme-intrinsics/acle_sme_target.c      |    9 +-
 clang/utils/TableGen/NeonEmitter.cpp              |   28 +
 clang/utils/TableGen/SveEmitter.cpp               |   56 +
 clang/utils/TableGen/TableGen.cpp                 |   12 +
 clang/utils/TableGen/TableGenBackends.h           |    2 +
 19 files changed, 1055 insertions(+), 829 deletions(-)

diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
index 691d14f013cf..c5ccb641ca80 100644
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -96,6 +96,9 @@ clang_tablegen(arm_sve_sema_rangechecks.inc -gen-arm-sve-sema-rangechecks
   ${CLANG_BASIC_OPTIONS}
   SOURCE arm_sve.td
   TARGET ClangARMSveSemaRangeChecks)
+clang_tablegen(arm_sve_streaming_attrs.inc -gen-arm-sve-streaming-attrs
+  SOURCE arm_sve.td
+  TARGET ClangARMSveStreamingAttrs)
 clang_tablegen(arm_sme_builtins.inc -gen-arm-sme-builtins
   SOURCE arm_sme.td
   TARGET ClangARMSmeBuiltins)
@@ -105,6 +108,9 @@ clang_tablegen(arm_sme_builtin_cg.inc -gen-arm-sme-builtin-codegen
 clang_tablegen(arm_sme_sema_rangechecks.inc -gen-arm-sme-sema-rangechecks
   SOURCE arm_sme.td
   TARGET ClangARMSmeSemaRangeChecks)
+clang_tablegen(arm_sme_streaming_attrs.inc -gen-arm-sme-streaming-attrs
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeStreamingAttrs)
 clang_tablegen(arm_cde_builtins.inc 
-gen-arm-cde-builtin-def SOURCE arm_cde.td TARGET ClangARMCdeBuiltinsDef) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 401811688990..758a4fe84dae 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -19,27 +19,27 @@ include "arm_sve_sme_incl.td" // Loads // Load one vector (scalar base) -def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; let TargetGuard = "sve,bf16" in { - def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; - def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; } // Load one vector (scalar base, VL displacement) -def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", 
"dPSl", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; // Load one vector (vector base) def SVLD1_GATHER_BASES_U : MInst<"svld1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1_gather_scalar_offset">; @@ -243,27 +243,27 @@ let TargetGuard = "sve,bf16" in { } // Load one vector, unextended load, non-temporal (scalar base) -def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; // Load one vector, unextended load, non-temporal (scalar base, VL displacement) -def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; let TargetGuard = "sve,bf16" in { - def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; - def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; } // Load one quadword and replicate (scalar base) -def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">; +def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">; + def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; } multiclass StructLoad { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } @@ -286,42 +286,42 @@ let TargetGuard = "sve,f64mm,bf16" in { } let TargetGuard = "sve,bf16" in { - def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; - def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone]>; - def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, 
"aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; - def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; - def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; - def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; } //////////////////////////////////////////////////////////////////////////////// // Stores // Store one vector (scalar base) -def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_S : MInst<"svst1w[_{d}]", 
"vPCd", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; // Store one vector (scalar base, VL displacement) -def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; let TargetGuard = "sve,bf16" in { - def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; - def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; } // Store one vector (vector base) @@ -394,9 +394,9 @@ def SVST1H_SCATTER_INDEX_S : MInst<"svst1h_scatter[_{2}base]_index[_{d}]", "v def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">; multiclass StructStore { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } // Store N vectors into N-element structure (scalar base) @@ -410,30 +410,30 @@ defm SVST3_VNUM : StructStore<"svst3_vnum[_{d}]", "vPpl3", "aarch64_sve_st3">; defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4">; // Store one vector, with no truncation, non-temporal (scalar base) -def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) -def SVSTNT1_VNUM : 
MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; let TargetGuard = "sve,bf16" in { - def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; - def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; } //////////////////////////////////////////////////////////////////////////////// // Prefetches // Prefetch (Scalar base) -def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Scalar base, VL displacement) -def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Vector bases) def SVPRFB_GATHER_BASES : MInst<"svprfb_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt8, "aarch64_sve_prfb_gather_scalar_offset">; @@ -488,9 +488,9 @@ def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone>; def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone>; multiclass svdup_base { - def NAME : SInst; + def NAME : SInst; let TargetGuard = "sve,bf16" in { - def _BF16: SInst; + def _BF16: SInst; } } @@ -499,14 +499,14 @@ defm SVDUP_M : svdup_base<"svdup[_n]_{d}", "ddPs", MergeOp1, "aarch64_sve_du defm SVDUP_X : svdup_base<"svdup[_n]_{d}", "dPs", MergeAnyExp, "aarch64_sve_dup">; defm SVDUP_Z : svdup_base<"svdup[_n]_{d}", "dPs", MergeZeroExp, "aarch64_sve_dup">; -def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index">; +def SVINDEX : SInst<"svindex_{d}", 
"dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index", [IsStreamingCompatible]>; // Integer arithmetic -multiclass SInstZPZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; +multiclass SInstZPZ { + def _M : SInst; + def _X : SInst; + def _Z : SInst; } defm SVABS : SInstZPZ<"svabs", "csil", "aarch64_sve_abs">; @@ -515,13 +515,13 @@ defm SVNEG : SInstZPZ<"svneg", "csil", "aarch64_sve_neg">; //------------------------------------------------------------------------------ multiclass SInstZPZZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; } defm SVABD_S : SInstZPZZ<"svabd", "csil", "aarch64_sve_sabd", "aarch64_sve_sabd_u">; @@ -553,26 +553,26 @@ multiclass SInstZPZZZ; } -defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad", "aarch64_sve_mla_u", [ReverseMergeAnyAccOp]>; -defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla_u">; -defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls_u">; -defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_mls_u", [ReverseMergeAnyAccOp]>; +defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad", "aarch64_sve_mla_u", [ReverseMergeAnyAccOp, IsStreamingCompatible]>; +defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla_u", [IsStreamingCompatible]>; +defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls_u", [IsStreamingCompatible]>; +defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_mls_u", [ReverseMergeAnyAccOp, IsStreamingCompatible]>; //------------------------------------------------------------------------------ -def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; -def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def 
SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; @@ -592,107 +592,107 @@ defm SVNOT : SInstZPZ<"svnot", "csilUcUsUiUl", "aarch64_sve_not">; // Shifts multiclass SInst_SHIFT { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; - def _WIDE_M : SInst; - def _WIDE_X : SInst; - def _WIDE_Z : SInst; + def _WIDE_M : SInst; + def _WIDE_X : SInst; + def _WIDE_Z : SInst; - def _WIDE_N_M : SInst; - def _WIDE_N_X : SInst; - def _WIDE_N_Z : SInst; + def _WIDE_N_M : SInst; + def _WIDE_N_X : SInst; + def _WIDE_N_Z : SInst; } defm SVASR : SInst_SHIFT<"svasr", "aarch64_sve_asr", "csil", "csi">; defm SVLSL : SInst_SHIFT<"svlsl", "aarch64_sve_lsl", "csilUcUsUiUl", "csiUcUsUi">; defm SVLSR : SInst_SHIFT<"svlsr", "aarch64_sve_lsr", "UcUsUiUl", "UcUsUi">; -def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">; +def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr">; + def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Integer reductions -def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv">; -def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv">; -def SVANDV : SInst<"svandv[_{d}]", 
"sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv">; -def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv">; -def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv">; -def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv">; -def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv">; -def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv">; -def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv">; +def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv", [IsStreamingCompatible]>; +def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv", [IsStreamingCompatible]>; +def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv", [IsStreamingCompatible]>; +def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv", [IsStreamingCompatible]>; +def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv", [IsStreamingCompatible]>; +def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv", [IsStreamingCompatible]>; +def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv", [IsStreamingCompatible]>; +def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv", [IsStreamingCompatible]>; +def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Integer comparisons -def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; -def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; - -def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, 
"aarch64_sve_cmphs", [ReverseCompare]>; -def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; - -def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; - -def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; +def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, 
"aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; + +def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", 
"PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // While comparisons -def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; -def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; +def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Counting bit @@ -703,12 +703,12 @@ multiclass SInstCLS def _Z : SInst; } -defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls">; -defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz">; -defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt">; +defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [IsStreamingCompatible]>; +defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [IsStreamingCompatible]>; +defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt">; + defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -763,13 +763,13 @@ def SVTMAD : SInst<"svtmad[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_ftma def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu", "hfd", 
MergeNone, "aarch64_sve_ftsmul_x">; def SVTSSEL : SInst<"svtssel[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftssel_x">; -def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale">; +def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; -def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale">; +def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", [ReverseMergeAnyAccOp]>; defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u">; @@ -780,42 +780,42 @@ defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fn defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u">; defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [ReverseMergeAnyAccOp]>; -def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, 
ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x">; -def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x">; -def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x">; -def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x">; +def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x", [IsStreamingCompatible]>; +def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x", [IsStreamingCompatible]>; +def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x", [IsStreamingCompatible]>; +def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point reductions -def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda">; -def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv">; -def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv">; -def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv">; -def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv">; -def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv">; +def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda", [IsStreamingCompatible]>; +def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv", [IsStreamingCompatible]>; +def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv", [IsStreamingCompatible]>; +def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv", [IsStreamingCompatible]>; +def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv", [IsStreamingCompatible]>; +def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point comparisons -def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, 
"aarch64_sve_facge">; -def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo">; +def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [IsStreamingCompatible]>; +def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge">; def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt">; @@ -823,19 +823,19 @@ def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_ def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo">; -def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; -def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne", 
[IsStreamingCompatible]>; +def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point conversions @@ -843,16 +843,16 @@ def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sv multiclass SInstCvtMXZ< string name, string m_types, string xz_types, string types, string intrinsic, list flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; } multiclass SInstCvtMX flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; + def _M : SInst; + def _X : SInst; } // svcvt_s##_f16 @@ -866,7 +866,7 @@ defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l", "aar let TargetGuard = "sve,bf16" in { defm SVCVT_BF16_F32 : SInstCvtMXZ<"svcvt_bf16[_f32]", "ddPM", "dPM", "b", "aarch64_sve_fcvt_bf16f32">; - def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone]>; + def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone, IsStreamingCompatible]>; } // svcvt_s##_f64 @@ -930,11 +930,11 @@ defm SVCVTLT_F64 : SInstCvtMX<"svcvtlt_f64[_f32]", "ddPh", "dPh", "d", "aarc defm SVCVTX_F32 : SInstCvtMXZ<"svcvtx_f32[_f64]", "MMPd", "MPd", "d", "aarch64_sve_fcvtx_f32f64">; -def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone]>; -def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone]>; +def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone, IsStreamingCompatible]>; +def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTNT_X : Implemented as macro by SveEmitter.cpp -def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone]>; +def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTXNT_X_F32 : Implemented as macro by SveEmitter.cpp } @@ -943,9 +943,9 @@ def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch6 // Permutations and selection multiclass SVEPerm { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } @@ -969,81 +969,81 @@ def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNo let TargetGuard = "sve,bf16" in { def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane">; } -def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; defm SVLASTA : SVEPerm<"svlasta[_{d}]", 
"sPd", "aarch64_sve_lasta">; defm SVLASTB : SVEPerm<"svlastb[_{d}]", "sPd", "aarch64_sve_lastb">; -def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev">; -def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel">; -def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice">; -def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl">; + def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1">; -def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2">; -def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi">; -def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi">; -def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo">; -def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo">; -def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1">; -def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2">; +def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi", [IsStreamingCompatible]>; +def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi", [IsStreamingCompatible]>; +def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo", [IsStreamingCompatible]>; +def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo", [IsStreamingCompatible]>; +def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; -def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev">; -def SVSEL_BF16 : 
SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel">; -def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice">; -def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1">; -def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2">; -def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1">; -def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2">; -} - -def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev">; -def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone]>; -def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone]>; -def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone]>; -def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel">; -def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1">; -def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone]>; -def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b32", [IsOverloadNone]>; -def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone]>; -def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2">; -def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone]>; -def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone]>; -def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone]>; -def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi">; -def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo">; -def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1">; -def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone]>; -def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone]>; -def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone]>; -def SVUZP2_B8 : SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2">; -def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone]>; -def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone]>; -def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone]>; -def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1">; -def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone]>; -def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone]>; -def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone]>; -def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2">; -def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone]>; -def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", 
[IsOverloadNone]>; -def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone]>; +def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +} + +def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi", [IsStreamingCompatible]>; +def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo", [IsStreamingCompatible]>; +def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B8 : 
SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Predicate creation -def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone]>; +def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; -def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue">; -def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL]>; +def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible]>; +def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL, IsStreamingCompatible]>; def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone>; def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone>; @@ -1055,33 +1055,33 @@ def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone>; //////////////////////////////////////////////////////////////////////////////// // Predicate operations -def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z">; -def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z">; -def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z">; -def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z">; -def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z">; -def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z">; -def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z">; - -def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka">; -def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z">; -def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, 
"aarch64_sve_brkb">; -def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z">; -def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z">; -def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z">; -def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z">; - -def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst">; -def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext">; +def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z", [IsStreamingCompatible]>; +def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z", [IsStreamingCompatible]>; +def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z", [IsStreamingCompatible]>; +def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z", [IsStreamingCompatible]>; +def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z", [IsStreamingCompatible]>; +def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z", [IsStreamingCompatible]>; +def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z", [IsStreamingCompatible]>; + +def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka", [IsStreamingCompatible]>; +def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z", [IsStreamingCompatible]>; +def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb", [IsStreamingCompatible]>; +def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z", [IsStreamingCompatible]>; +def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z", [IsStreamingCompatible]>; +def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z", [IsStreamingCompatible]>; +def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z", [IsStreamingCompatible]>; + +def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst", [IsStreamingCompatible]>; +def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Testing predicates -def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any">; -def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first">; -def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last">; +def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any", [IsStreamingCompatible]>; +def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first", [IsStreamingCompatible]>; +def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // FFR manipulation @@ -1094,21 +1094,21 @@ def SVWRFFR : SInst<"svwrffr", "vP", "Pc", MergeNone, "", [IsOverloadNone]>; 
//////////////////////////////////////////////////////////////////////////////// // Counting elements -def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone]>; -def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone]>; -def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone]>; -def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone]>; +def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone, IsStreamingCompatible]>; -def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone]>; +def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; -def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp">; -def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone>; +def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp", [IsStreamingCompatible]>; +def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone>; +def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone, "", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1125,20 +1125,20 @@ def UnsignedWord : sat_type<"U", "Ui">; def UnsignedDoubleWord : sat_type<"U", "Ul">; multiclass SInst_SAT1 { - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } multiclass SInst_SAT2 { - def "" : SInst]>; - def _ALL : SInst]>; + def "" : SInst]>; + def _ALL : SInst]>; - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } defm SVQDECB_S : SInst_SAT1<"svqdecb", "aarch64_sve_sqdecb", SignedByte>; @@ -1159,32 +1159,32 @@ defm SVQINCW_U : SInst_SAT2<"svqincw", "aarch64_sve_uqincw", UnsignedWord>; defm SVQINCD_S : SInst_SAT2<"svqincd", "aarch64_sve_sqincd", SignedDoubleWord>; defm SVQINCD_U : SInst_SAT2<"svqincd", "aarch64_sve_uqincd", UnsignedDoubleWord>; -def SVQDECP_S : SInst<"svqdecp[_{d}]", 
"ddP", "sil", MergeNone, "aarch64_sve_sqdecp">; -def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp">; -def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp">; -def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp">; +def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp", [IsStreamingCompatible]>; +def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp", [IsStreamingCompatible]>; +def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp", [IsStreamingCompatible]>; +def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp", [IsStreamingCompatible]>; -def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32">; -def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64">; -def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32">; -def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64">; -def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32">; -def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64">; -def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32">; -def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64">; +def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64", [IsStreamingCompatible]>; +def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64", [IsStreamingCompatible]>; let TargetGuard = "sve,i8mm" in { def SVMLLA_S32 : SInst<"svmmla[_s32]", "ddqq","i", MergeNone, "aarch64_sve_smmla">; def SVMLLA_U32 : SInst<"svmmla[_u32]", "ddqq","Ui", MergeNone, "aarch64_sve_ummla">; def SVUSMLLA_S32 : SInst<"svusmmla[_s32]", "ddbq","i", MergeNone, "aarch64_sve_usmmla">; -def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot">; -def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot">; -def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; -def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; +def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, 
"aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; +def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; -def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } let TargetGuard = "sve,f32mm" in { @@ -1193,12 +1193,12 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla let TargetGuard = "sve,f64mm" in { def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd","d", MergeNone, "aarch64_sve_fmmla">; -def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q">; -def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q">; +def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q", [IsStreamingCompatible]>; +def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q", [IsStreamingCompatible]>; +def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } let TargetGuard = "sve,bf16,f64mm" in { @@ -1212,20 +1212,20 @@ def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarc //////////////////////////////////////////////////////////////////////////////// // Vector creation -def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; +def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, 
IsStreamingCompatible]>; def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; let TargetGuard = "sve,bf16" in { -def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef]>; +def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate]>; def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate]>; @@ -1255,14 +1255,14 @@ def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT let TargetGuard = "sve2" in { -def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; -def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; +def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, 
IsStreamingCompatible]>;
+def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1304,49 +1304,49 @@ multiclass SInstZPZxZ
-defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl">;
-defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl">;
-defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl">;
-defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl">;
-defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl">;
-defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl">;
-defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd">;
-defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd">;
-
-def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba">;
-def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">;
-def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh">;
-def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh">;
-def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah">;
-def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh">;
-
-def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba">;
-def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">;
-def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh">;
-def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh">;
-def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah">;
-def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh">;
-
-def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-
-def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
-def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
-def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
-def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
-def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
-def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
-def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight,
1>]>; -def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl", [IsStreamingCompatible]>; +defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl", [IsStreamingCompatible]>; +defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl", [IsStreamingCompatible]>; +defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl", [IsStreamingCompatible]>; +defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl", [IsStreamingCompatible]>; +defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl", [IsStreamingCompatible]>; +defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd", [IsStreamingCompatible]>; +defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd", [IsStreamingCompatible]>; + +def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def 
SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+
+def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
+def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
+def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
+def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
+def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1358,29 +1358,29 @@ multiclass SInstPairwise
-defm SVADDP   : SInstPairwise<"svaddp",   "csliUcUsUiUl", "aarch64_sve_addp">;
-defm SVADDP_F : SInstPairwise<"svaddp",   "hfd",          "aarch64_sve_faddp">;
-defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd",          "aarch64_sve_fmaxnmp">;
-defm SVMAXP_F : SInstPairwise<"svmaxp",   "hfd",          "aarch64_sve_fmaxp">;
-defm SVMAXP_S : SInstPairwise<"svmaxp",   "csli",         "aarch64_sve_smaxp">;
-defm SVMAXP_U : SInstPairwise<"svmaxp",   "UcUsUiUl",     "aarch64_sve_umaxp">;
-defm SVMINNMP : SInstPairwise<"svminnmp", "hfd",          "aarch64_sve_fminnmp">;
-defm SVMINP_F : SInstPairwise<"svminp",   "hfd",          "aarch64_sve_fminp">;
-defm SVMINP_S : SInstPairwise<"svminp",   "csli",         "aarch64_sve_sminp">;
-defm SVMINP_U :
SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp">; +defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp", [IsStreamingCompatible]>; +defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp", [IsStreamingCompatible]>; +defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp", [IsStreamingCompatible]>; +defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp", [IsStreamingCompatible]>; +defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp", [IsStreamingCompatible]>; +defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp", [IsStreamingCompatible]>; +defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp", [IsStreamingCompatible]>; +defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp", [IsStreamingCompatible]>; +defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp", [IsStreamingCompatible]>; +defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Widening pairwise arithmetic let TargetGuard = "sve2" in { -def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp">; -def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp">; -def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp">; +def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp", [IsStreamingCompatible]>; -def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp">; -def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp">; -def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp">; +def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1388,56 +1388,56 @@ def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_s // let TargetGuard = "sve2" in { -def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; - -def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", 
"csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; -def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; + +def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; +def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Large integer arithmetic let TargetGuard = "sve2" in { -def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; -def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", 
"ddda", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Multiplication by indexed elements let TargetGuard = "sve2" in { -def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Uniform complex integer arithmetic let TargetGuard = "sve2" in { -def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; } @@ -1445,18 +1445,18 @@ def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", // SVE2 - Widening DSP operations multiclass 
SInstWideDSPAcc {
-  def : SInst;
-  def _N : SInst;
+  def : SInst;
+  def _N : SInst;
 }
 multiclass SInstWideDSPLong {
-  def : SInst;
-  def _N : SInst;
+  def : SInst;
+  def _N : SInst;
 }
 multiclass SInstWideDSPWide {
-  def : SInst;
-  def _N : SInst;
+  def : SInst;
+  def _N : SInst;
 }
 
 let TargetGuard = "sve2" in {
@@ -1505,87 +1505,87 @@ defm SVSUBWB_U : SInstWideDSPWide<"svsubwb", "UsUiUl", "aarch64_sve_usubwb">;
 defm SVSUBWT_S : SInstWideDSPWide<"svsubwt", "sil", "aarch64_sve_ssubwt">;
 defm SVSUBWT_U : SInstWideDSPWide<"svsubwt", "UsUiUl", "aarch64_sve_usubwt">;
 
-def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
-
-def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone>;
-def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone>;
-def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone>;
-def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone>;
-
-def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
+def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
+def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
+def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
+
+def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>;
+def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>;
+def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>;
+def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>;
+
+def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Narrowing DSP operations
 
 let TargetGuard = "sve2" in {
-def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">;
-def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">;
-def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">;
-def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">;
-def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">;
-def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">;
-def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">;
-def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">;
-
-def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">;
-def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">;
-def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">;
-def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">;
-def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">;
-def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">;
-def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">;
-def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">;
-
-def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
-
-def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
-def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
-def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
-def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
-def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
-def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
-def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
-def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>;
+def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>;
+def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>;
+def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>;
+def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>;
+def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>;
+def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>;
+def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>;
+
+def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>;
+def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>;
+def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>;
+def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>;
+def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>;
+def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>;
+def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>;
+def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>;
+
+def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>;
+
+def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
+def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Unary narrowing operations
 
 let TargetGuard = "sve2" in {
-def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb">;
-def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb">;
-def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb">;
+def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb", [IsStreamingCompatible]>;
+def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb", [IsStreamingCompatible]>;
+def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb", [IsStreamingCompatible]>;
 
-def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt">;
-def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt">;
-def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt">;
+def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt", [IsStreamingCompatible]>;
+def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt", [IsStreamingCompatible]>;
+def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt", [IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1726,18 +1726,18 @@ def SVSTNT1W_SCATTER_INDEX_S : MInst<"svstnt1w_scatter[_{2}base]_index[_{d}]", "
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Polynomial arithmetic
 
 let TargetGuard = "sve2" in {
-def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">;
-def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">;
-def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">;
-def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">;
-def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul">;
-def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul">;
-def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone>;
-def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone>;
-def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">;
-def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">;
-def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone>;
-def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone>;
+def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>;
+def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>;
+def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>;
+def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>;
+def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>;
+def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>;
+def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
+def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
+def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>;
+def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>;
+def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
+def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
 def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">;
 def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">;
 }
@@ -1746,8 +1746,8 @@ def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", Mer
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Complex integer dot product
 
 let TargetGuard = "sve2" in {
-def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>;
-def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [], [ImmCheck<4, ImmCheckComplexRotAll90>,
+def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>;
+def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>,
                                                                                  ImmCheck<3, ImmCheckLaneIndexDot, 2>]>;
 }
 
@@ -1755,27 +1755,27 @@ def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Floating-point widening multiply-accumulate
 
 let TargetGuard = "sve2" in {
-def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb">;
-def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb">;
-def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt">;
-def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt">;
-def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb">;
-def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb">;
-def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt">;
-def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt">;
-def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>;
+def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>;
+def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>;
+def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>;
+def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>;
+def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>;
+def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>;
+def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>;
+def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Floating-point integer binary logarithm
 
 let TargetGuard = "sve2" in {
-def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb">;
-def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb">;
-def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb">;
+def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb", [IsStreamingCompatible]>;
+def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb", [IsStreamingCompatible]>;
+def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb", [IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1797,32 +1797,32 @@ def SVNMATCH : SInst<"svnmatch[_{d}]", "PPdd", "csUcUs", MergeNone, "aarch64_sve
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Contiguous conflict detection
 
 let TargetGuard = "sve2" in {
-def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW]>;
-def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>;
-def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW]>;
-def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW]>;
+def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW, IsStreamingCompatible]>;
 
-def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW]>;
-def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>;
-def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW]>;
-def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW]>;
+def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW, IsStreamingCompatible]>;
 }
 
 let TargetGuard = "sve2,bf16" in {
-def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>;
-def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>;
+def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Extended table lookup/permute
 
 let TargetGuard = "sve2" in {
-def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone>;
-def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx">;
+def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>;
+def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>;
 }
 
 let TargetGuard = "sve2,bf16" in {
-def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone>;
-def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx">;
+def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [IsStreamingCompatible]>;
+def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d11c44bd45a2..2c6085b1b223 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13616,6 +13616,7 @@ private:
   bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool ParseSVEImmChecks(CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 3> &ImmChecks);
+  bool CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall);
   bool CheckARMCoprocessorImmediate(const TargetInfo &TI, const Expr *CoprocArg,
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index af148d703413..efc1aeea42d2 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3034,7 +3034,6 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
                                      const FunctionDecl *FD,
                                      ArmStreamingType BuiltinType) {
   ArmStreamingType FnType = getArmStreamingFnType(FD);
-
   if (FnType == ArmStreaming && BuiltinType == ArmNonStreaming) {
     S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin)
         << TheCall->getSourceRange() << "streaming";
@@ -3046,9 +3045,53 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
         << TheCall->getSourceRange() << "streaming compatible";
     return;
   }
+
+  if (FnType == ArmNonStreaming && BuiltinType == ArmStreaming) {
+    S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin)
+        << TheCall->getSourceRange() << "non-streaming";
+  }
+}
+
+bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
+  if (const FunctionDecl *FD = getCurFunctionDecl()) {
+    std::optional<ArmStreamingType> BuiltinType;
+
+    switch (BuiltinID) {
+#define GET_SME_STREAMING_ATTRS
+#include "clang/Basic/arm_sme_streaming_attrs.inc"
+#undef GET_SME_STREAMING_ATTRS
+    }
+
+    if (BuiltinType)
+      checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType);
+  }
+
+  // Range check SME intrinsics that take immediate values.
+  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
+
+  switch (BuiltinID) {
+  default:
+    return false;
+#define GET_SME_IMMEDIATE_CHECK
+#include "clang/Basic/arm_sme_sema_rangechecks.inc"
+#undef GET_SME_IMMEDIATE_CHECK
+  }
+
+  return ParseSVEImmChecks(TheCall, ImmChecks);
 }
 
 bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
+  if (const FunctionDecl *FD = getCurFunctionDecl()) {
+    std::optional<ArmStreamingType> BuiltinType;
+
+    switch (BuiltinID) {
+#define GET_SVE_STREAMING_ATTRS
+#include "clang/Basic/arm_sve_streaming_attrs.inc"
+#undef GET_SVE_STREAMING_ATTRS
+    }
+    if (BuiltinType)
+      checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType);
+  }
   // Range check SVE intrinsics that take immediate values.
   SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
 
@@ -3058,9 +3101,6 @@ bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
 #define GET_SVE_IMMEDIATE_CHECK
 #include "clang/Basic/arm_sve_sema_rangechecks.inc"
 #undef GET_SVE_IMMEDIATE_CHECK
-#define GET_SME_IMMEDIATE_CHECK
-#include "clang/Basic/arm_sme_sema_rangechecks.inc"
-#undef GET_SME_IMMEDIATE_CHECK
   }
 
   return ParseSVEImmChecks(TheCall, ImmChecks);
@@ -3447,6 +3487,9 @@ bool Sema::CheckAArch64BuiltinFunctionCall(const TargetInfo &TI,
   if (CheckSVEBuiltinFunctionCall(BuiltinID, TheCall))
     return true;
 
+  if (CheckSMEBuiltinFunctionCall(BuiltinID, TheCall))
+    return true;
+
   // For intrinsics which take an immediate value as part of the instruction,
   // range check them here.
   unsigned i = 0, l = 0, u = 0;
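Both switches in CheckSMEBuiltinFunctionCall are populated from TableGen-emitted fragments; the .inc files themselves are not part of this diff. For illustration, the GET_SME_STREAMING_ATTRS expansion would plausibly look like the following (builtin ID and case shape assumed here, not copied from the generated file):

    // Hypothetical excerpt of arm_sme_streaming_attrs.inc:
    case SME::BI__builtin_sme_svaddha_za32_u32_m:
      BuiltinType = ArmStreaming; // ZA builtins require streaming mode
      break;

Any SME builtin whose entry contradicts the calling function's streaming mode is then routed into checkArmStreamingBuiltin above and diagnosed.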
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
index b0855553df79..cf5de1f0526d 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
@@ -21,7 +21,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn);
 }
@@ -33,7 +33,7 @@ void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn);
 }
@@ -45,7 +45,7 @@ void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn);
 }
@@ -57,7 +57,7 @@ void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn);
 }
@@ -69,7 +69,7 @@ void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn);
 }
@@ -81,7 +81,7 @@ void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn);
 }
@@ -93,7 +93,7 @@ void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn);
 }
@@ -105,6 +105,6 @@ void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(3, pn, pm, zn);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
index 2f0f97e742e3..951262620965 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
@@ -21,7 +21,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn);
 }
@@ -33,7 +33,7 @@ void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn);
 }
@@ -45,7 +45,7 @@ void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn);
 }
@@ -57,7 +57,7 @@ void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn);
 }
@@ -69,7 +69,7 @@ void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn);
 }
@@ -81,7 +81,7 @@ void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn);
 }
@@ -93,7 +93,7 @@ void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn);
 }
@@ -105,6 +105,6 @@ void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(7, pn, pm, zn);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index b52aee12f9c7..4c0debe9a3a4 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
+void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -29,7 +29,7 @@ void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
+void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
@@ -41,7 +41,7 @@ void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) {
+void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
@@ -53,7 +53,7 @@ void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) {
+void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
@@ -65,7 +65,7 @@ void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) {
+void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
@@ -75,7 +75,7 @@ void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) {
+void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -85,6 +85,6 @@ void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) {
+void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index 835d7c75ba6e..4c91281f31c6 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -21,7 +21,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
@@ -33,7 +33,7 @@ void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
@@ -45,7 +45,7 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
+void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
@@ -57,7 +57,7 @@ void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) {
+void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
@@ -69,6 +69,6 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) {
+void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
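The __arm_streaming keyword added to every test above and below is what keeps these tests warning-free under the new Sema check: the ZA builtins are streaming-only, so a caller without the keyword now trips the non-streaming branch of checkArmStreamingBuiltin. Roughly (illustrative C only; diagnostic wording paraphrased, ZA-state attributes omitted):

    #include <arm_sme.h> // header name assumed; trees of this era used a draft-spec header

    void bad_caller(svbool_t pn, svbool_t pm, svuint32_t zn) {
      // expected-warning: builtin is incompatible with a non-streaming function
      svaddha_za32_u32_m(0, pn, pm, zn);
    }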
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
index 923b6b96b4b4..68d0071e4af4 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
+void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -29,7 +29,7 @@ void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
+void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
@@ -41,7 +41,7 @@ void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) {
+void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
@@ -53,7 +53,7 @@ void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) {
+void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
@@ -65,7 +65,7 @@ void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) {
+void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
@@ -75,7 +75,7 @@ void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) {
+void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -85,6 +85,6 @@ void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) {
+void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index ea1e55001b65..d2852b0ee563 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -21,7 +21,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
@@ -33,7 +33,7 @@ void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
@@ -45,7 +45,7 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
+void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
@@ -57,7 +57,7 @@ void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) {
+void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
@@ -69,6 +69,6 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) {
+void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
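The acle_sme_read.c updates that follow apply the same keyword to the tile-slice read tests; each test wraps a single svread_hor call. A condensed example in the same shape (illustrative only; ZA-state attribute omitted for brevity):

    #include <arm_sme.h>

    svint8_t read_slice(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
      // _m merges lanes where pg is false from zd; ZA8 has a single tile, 0.
      return svread_hor_za8_s8_m(zd, pg, 0, slice_base);
    }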
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
index f7a0852387e8..28a385e111f5 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) {
+svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
@@ -30,7 +30,7 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) {
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) {
+svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 15;
   return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -42,7 +42,7 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) {
+svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
@@ -54,7 +54,7 @@ svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) {
+svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -66,7 +66,7 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) {
+svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
@@ -78,7 +78,7 @@ svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) {
+svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -90,7 +90,7 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) {
+svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
@@ -102,7 +102,7 @@ svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) {
+svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -113,7 +113,7 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) {
+svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
@@ -124,7 +124,7 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) {
+svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 15;
   return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -136,7 +136,7 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) {
+svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
@@ -148,7 +148,7 @@ svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) {
+svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -160,7 +160,7 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) {
+svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
@@ -172,7 +172,7 @@ svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) {
+svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -184,7 +184,7 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) {
+svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
@@ -196,7 +196,7 @@ svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) {
+svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -208,7 +208,7 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) {
+svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
@@ -220,7 +220,7 @@ svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) {
+svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -232,7 +232,7 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) {
+svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
@@ -244,7 +244,7 @@ svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) {
+svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -256,7 +256,7 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) {
+svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
@@ -268,7 +268,7 @@ svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) {
+svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -280,7 +280,7 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) {
+svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
@@ -292,7 +292,7 @@ svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) {
+svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -303,7 +303,7 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) {
+svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
@@ -313,7 +313,7 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) {
+svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
@@ -324,7 +324,7 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) {
+svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
@@ -335,7 +335,7 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) {
+svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
@@ -346,7 +346,7 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) {
+svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
@@ -357,7 +357,7 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) {
+svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
@@ -368,7 +368,7 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) {
+svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
@@ -379,7 +379,7 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) {
+svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
@@ -389,7 +389,7 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) {
+svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
@@ -399,7 +399,7 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) {
+svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
@@ -410,7 +410,7 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) {
+svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
@@ -421,7 +421,7 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) {
+svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
@@ -432,7 +432,7 @@ svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) {
+svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
@@ -443,7 +443,7 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) {
+svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
   return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
@@ -454,7 +454,7 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) {
+svuint64_t
test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base); } @@ -465,7 +465,7 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { +svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base); } @@ -476,7 +476,7 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base); } @@ -487,7 +487,7 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base); } @@ -498,7 +498,7 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base); } @@ -509,7 +509,7 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base); } @@ -520,7 +520,7 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { +svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base); } @@ -531,7 +531,7 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { +svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base); } @@ -542,7 +542,7 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { +svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base); } @@ -553,7 +553,7 @@ svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { +svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base); } @@ -563,7 +563,7 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { +svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base); } @@ -574,7 +574,7 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { +svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 15; return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice); } @@ -586,7 +586,7 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { +svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base); } @@ -598,7 +598,7 @@ svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { +svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 
7; return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice); } @@ -610,7 +610,7 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { +svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base); } @@ -622,7 +622,7 @@ svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { +svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice); } @@ -634,7 +634,7 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { +svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base); } @@ -646,7 +646,7 @@ svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { +svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice); } @@ -657,7 +657,7 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { +svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base); } @@ -668,7 +668,7 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { +svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 15; return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice); } @@ -680,7 +680,7 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // 
-svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { +svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base); } @@ -692,7 +692,7 @@ svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { +svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice); } @@ -704,7 +704,7 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { +svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base); } @@ -716,7 +716,7 @@ svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { +svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice); } @@ -728,7 +728,7 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { +svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base); } @@ -740,7 +740,7 @@ svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { +svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice); } @@ -752,7 +752,7 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base); } @@ -764,7 +764,7 @@ svfloat16_t 
test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice); } @@ -776,7 +776,7 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base); } @@ -788,7 +788,7 @@ svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice); } @@ -800,7 +800,7 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { +svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base); } @@ -812,7 +812,7 @@ svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { +svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice); } @@ -824,7 +824,7 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { +svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base); } @@ -836,7 +836,7 @@ svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t 
test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { +svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice); } @@ -847,7 +847,7 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { +svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base); } @@ -857,7 +857,7 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { +svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base); } @@ -868,7 +868,7 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { +svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base); } @@ -879,7 +879,7 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { +svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base); } @@ -890,7 +890,7 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { +svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base); } @@ -901,7 +901,7 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { +svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base); } @@ -912,7 +912,7 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, 
uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { +svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base); } @@ -923,7 +923,7 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { +svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base); } @@ -933,7 +933,7 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { +svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base); } @@ -943,7 +943,7 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { +svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base); } @@ -954,7 +954,7 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { +svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base); } @@ -965,7 +965,7 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { +svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base); } @@ -976,7 +976,7 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { +svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) 
__arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base); } @@ -987,7 +987,7 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { +svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base); } @@ -998,7 +998,7 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { +svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base); } @@ -1009,7 +1009,7 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { +svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base); } @@ -1020,7 +1020,7 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base); } @@ -1031,7 +1031,7 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base); } @@ -1042,7 +1042,7 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base); } @@ -1053,7 +1053,7 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 
[[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { +svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base); } @@ -1064,7 +1064,7 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { +svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base); } @@ -1075,7 +1075,7 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { +svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base); } @@ -1086,7 +1086,7 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { +svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base); } @@ -1097,6 +1097,6 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { +svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c index 395918b936b3..f574eec13ecb 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -19,7 +19,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn); } @@ -30,7 +30,7 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void 
test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn); } @@ -42,7 +42,7 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn); } @@ -54,7 +54,7 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn); } @@ -66,7 +66,7 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn); } @@ -78,7 +78,7 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn); } @@ -90,7 +90,7 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn); } @@ -102,7 +102,7 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn); } @@ -113,7 +113,7 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, 
svuint8_t zn) { +void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn); } @@ -124,7 +124,7 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { +void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn); } @@ -136,7 +136,7 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn); } @@ -148,7 +148,7 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn); } @@ -160,7 +160,7 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn); } @@ -172,7 +172,7 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice, pg, zn); } @@ -184,7 +184,7 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { +void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn); } @@ -196,7 +196,7 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, 
svuint64_t zn) { +void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn); } @@ -208,7 +208,7 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn); } @@ -220,7 +220,7 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn); } @@ -232,7 +232,7 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn); } @@ -244,7 +244,7 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn); } @@ -256,7 +256,7 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn); } @@ -268,7 +268,7 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn); } @@ -280,7 +280,7 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret 
void // -void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn); } @@ -292,7 +292,7 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn); } @@ -303,7 +303,7 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn); } @@ -313,7 +313,7 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn); } @@ -324,7 +324,7 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn); } @@ -335,7 +335,7 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn); } @@ -346,7 +346,7 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn); } @@ -357,7 +357,7 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void 
test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn); } @@ -368,7 +368,7 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn); } @@ -379,7 +379,7 @@ void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn); } @@ -389,7 +389,7 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { +void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn); } @@ -399,7 +399,7 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { +void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn); } @@ -410,7 +410,7 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn); } @@ -421,7 +421,7 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn); } @@ -432,7 +432,7 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void 
test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn); } @@ -443,7 +443,7 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn); } @@ -454,7 +454,7 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { +void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn); } @@ -465,7 +465,7 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { +void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn); } @@ -476,7 +476,7 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn); } @@ -487,7 +487,7 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn); } @@ -498,7 +498,7 @@ void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn); } @@ -509,7 +509,7 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void 
test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn); } @@ -520,7 +520,7 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn); } @@ -531,7 +531,7 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn); } @@ -542,7 +542,7 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn); } @@ -553,7 +553,7 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn); } @@ -563,7 +563,7 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn); } @@ -574,7 +574,7 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn); } @@ -586,7 +586,7 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void 
test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn); } @@ -598,7 +598,7 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn); } @@ -610,7 +610,7 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn); } @@ -622,7 +622,7 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn); } @@ -634,7 +634,7 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn); } @@ -646,7 +646,7 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn); } @@ -657,7 +657,7 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { +void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn); } @@ -668,7 +668,7 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void 
test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { +void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn); } @@ -680,7 +680,7 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn); } @@ -692,7 +692,7 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn); } @@ -704,7 +704,7 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn); } @@ -716,7 +716,7 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn); } @@ -728,7 +728,7 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { +void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn); } @@ -740,7 +740,7 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { +void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn); } @@ -752,7 +752,7 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], 
[[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn); } @@ -764,7 +764,7 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn); } @@ -776,7 +776,7 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn); } @@ -788,7 +788,7 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn); } @@ -800,7 +800,7 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn); } @@ -812,7 +812,7 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn); } @@ -824,7 +824,7 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn); } @@ -836,7 +836,7 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) // CHECK-NEXT: tail call void 
@llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn); } @@ -847,7 +847,7 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn); } @@ -857,7 +857,7 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) { +void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, pg, zn); } @@ -868,7 +868,7 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn); } @@ -879,7 +879,7 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) { +void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn); } @@ -890,7 +890,7 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn); } @@ -901,7 +901,7 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) { +void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn); } @@ -912,7 +912,7 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn // CHECK-NEXT: tail call void 
@llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn); } @@ -923,7 +923,7 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { +void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn); } @@ -933,7 +933,7 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { +void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn); } @@ -943,7 +943,7 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { +void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn); } @@ -954,7 +954,7 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn); } @@ -965,7 +965,7 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { +void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn); } @@ -976,7 +976,7 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn); } @@ -987,7 +987,7 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void 
@llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { +void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn); } @@ -998,7 +998,7 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { +void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn); } @@ -1009,7 +1009,7 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { +void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn); } @@ -1020,7 +1020,7 @@ void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn); } @@ -1031,7 +1031,7 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { +void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn); } @@ -1042,7 +1042,7 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn); } @@ -1053,7 +1053,7 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { +void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn); } @@ -1064,7 +1064,7 @@ void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: 
tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn); } @@ -1075,7 +1075,7 @@ void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { +void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn); } @@ -1086,7 +1086,7 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn); } @@ -1097,6 +1097,6 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { +void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn); } diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index e77e09c44351..361a9e82a3ad 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -5,6 +5,8 @@ // REQUIRES: aarch64-registered-target #include "arm_neon.h" +#include "arm_sme_draft_spec_subject_to_change.h" +#include "arm_sve.h" int16x8_t incompat_neon_sm(int16x8_t splat) __arm_streaming { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} @@ -20,3 +22,78 @@ int16x8_t incompat_neon_smc(int16x8_t splat) __arm_streaming_compatible { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); } + +void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible __arm_shared_za { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sme_svld1_hor_za128(0, 0, pg, ptr); +} + +svuint32_t incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) __arm_streaming { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__arm_locally_streaming svuint32_t incompat_sve_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return 
__builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +svuint32_t incompat_sve_smc(svbool_t pg, svuint32_t a, int64_t b) __arm_streaming_compatible { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +svuint32_t incompat_sve2_sm(svbool_t pg, svuint32_t a, int64_t b) __arm_streaming { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__arm_locally_streaming svuint32_t incompat_sve2_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +svuint32_t incompat_sve2_smc(svbool_t pg, svuint32_t a, int64_t b) __arm_streaming_compatible { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +void incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_shared_za { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} + svmops_za32_f32_m(0, pn, pm, zn, zm); +} + +svfloat64_t streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) __arm_streaming { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__arm_locally_streaming svfloat64_t locally_streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +svfloat64_t streaming_compatible_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) __arm_streaming_compatible { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +svint16_t streaming_caller_sve2(svint16_t op1, svint16_t op2) __arm_streaming { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__arm_locally_streaming svint16_t locally_streaming_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +svint16_t streaming_compatible_caller_sve2(svint16_t op1, svint16_t op2) __arm_streaming_compatible { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +svbool_t streaming_caller_ptrue(void) __arm_streaming { + // expected-no-warning + return svand_z(svptrue_b16(), svptrue_pat_b16(SV_ALL), svptrue_pat_b16(SV_VL4)); +} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index 1faa5638c801..47c7210206b0 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -12,7 +12,7 @@ #include -void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) { +void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} @@ -32,7 +32,7 @@ void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, slice, pg, svundef_s8()); } -void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) { +void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { // 
expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} @@ -52,7 +52,7 @@ void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, slice, pg, svundef_s16()); } -void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) { +void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} @@ -90,7 +90,7 @@ void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svusmops_za32, _u8, _m,)(-1, pg, pg, svundef_u8(), svundef_s8()); } -void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) { +void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} @@ -133,7 +133,7 @@ void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svmops_za64, _f64, _m,)(-1, pg, pg, svundef_f64(), svundef_f64()); } -void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) { +void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} @@ -153,14 +153,14 @@ void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, slice, pg, svundef_s8()); } -void test_range_0_255(svbool_t pg, void *ptr) { +void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming { // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(256); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(-1); } -void test_constant(uint64_t u64, svbool_t pg, void *ptr) { +void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming { SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}} SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c index 2de6d9f6877f..7cfe9fdfbd24 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c @@ -6,20 +6,21 @@ #include __attribute__((target("sme"))) -void test_sme(svbool_t pg, void *ptr) { +void test_sme(svbool_t pg, void *ptr) __arm_streaming { svld1_hor_za8(0, 0, pg, ptr); } __attribute__((target("arch=armv8-a+sme"))) -void test_arch_sme(svbool_t pg, void *ptr) { +void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming { svld1_hor_vnum_za32(0, 0, pg, ptr, 0); } 
__attribute__((target("+sme"))) -void test_plus_sme(svbool_t pg, void *ptr) { +void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming { svst1_ver_za16(0, 0, pg, ptr); } +__attribute__((target("+sme"))) void undefined(svbool_t pg, void *ptr) { - svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-error {{'svst1_ver_vnum_za64' needs target feature sme}} + svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-warning {{builtin call has undefined behaviour when called from a non-streaming function}} } diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 936724b9ce38..b19048eff5db 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -550,6 +550,8 @@ class NeonEmitter { void createIntrinsic(Record *R, SmallVectorImpl &Out); void genBuiltinsDef(raw_ostream &OS, SmallVectorImpl &Defs); + void genStreamingSVECompatibleList(raw_ostream &OS, + SmallVectorImpl &Defs); void genOverloadTypeCheckCode(raw_ostream &OS, SmallVectorImpl &Defs); void genIntrinsicRangeCheckCode(raw_ostream &OS, @@ -2039,6 +2041,30 @@ void NeonEmitter::genBuiltinsDef(raw_ostream &OS, OS << "#endif\n\n"; } +void NeonEmitter::genStreamingSVECompatibleList( + raw_ostream &OS, SmallVectorImpl &Defs) { + OS << "#ifdef GET_NEON_STREAMING_COMPAT_FLAG\n"; + + std::set Emitted; + for (auto *Def : Defs) { + // If the def has a body (that is, it has Operation DAGs), it won't call + // __builtin_neon_* so we don't need to generate a definition for it. + if (Def->hasBody()) + continue; + + std::string Name = Def->getMangledName(); + if (Emitted.find(Name) != Emitted.end()) + continue; + + // FIXME: We should make exceptions here for some NEON builtins that are + // permitted in streaming mode. + OS << "case NEON::BI__builtin_neon_" << Name + << ": BuiltinType = ArmNonStreaming; break;\n"; + Emitted.insert(Name); + } + OS << "#endif\n\n"; +} + /// Generate the ARM and AArch64 overloaded type checking code for /// SemaChecking.cpp, checking for unique builtin declarations. void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS, @@ -2222,6 +2248,8 @@ void NeonEmitter::runHeader(raw_ostream &OS) { // Generate ARM overloaded type checking code for SemaChecking.cpp genOverloadTypeCheckCode(OS, Defs); + genStreamingSVECompatibleList(OS, Defs); + // Generate ARM range checking code for shift/lane immediates. genIntrinsicRangeCheckCode(OS, Defs); } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 545f2bcc7f62..a9b5817d760b 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -371,6 +371,9 @@ public: /// Emit all the information needed to map builtin -> LLVM IR intrinsic. void createSMECodeGenMap(raw_ostream &o); + /// Create a table for a builtin's requirement for PSTATE.SM. + void createStreamingAttrs(raw_ostream &o, ACLEKind Kind); + /// Emit all the range checks for the immediates. 
void createSMERangeChecks(raw_ostream &o); @@ -1624,6 +1627,51 @@ void SVEEmitter::createSMERangeChecks(raw_ostream &OS) { OS << "#endif\n\n"; } +void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + StringRef ExtensionKind; + switch (Kind) { + case ACLEKind::SME: + ExtensionKind = "SME"; + break; + case ACLEKind::SVE: + ExtensionKind = "SVE"; + break; + } + + OS << "#ifdef GET_" << ExtensionKind << "_STREAMING_ATTRS\n"; + + llvm::StringMap> StreamingMap; + + uint64_t IsStreamingFlag = getEnumValueForFlag("IsStreaming"); + uint64_t IsStreamingCompatibleFlag = + getEnumValueForFlag("IsStreamingCompatible"); + for (auto &Def : Defs) { + if (Def->isFlagSet(IsStreamingFlag)) + StreamingMap["ArmStreaming"].insert(Def->getMangledName()); + else if (Def->isFlagSet(IsStreamingCompatibleFlag)) + StreamingMap["ArmStreamingCompatible"].insert(Def->getMangledName()); + else + StreamingMap["ArmNonStreaming"].insert(Def->getMangledName()); + } + + for (auto BuiltinType : StreamingMap.keys()) { + for (auto Name : StreamingMap[BuiltinType]) { + OS << "case " << ExtensionKind << "::BI__builtin_" + << ExtensionKind.lower() << "_"; + OS << Name << ":\n"; + } + OS << " BuiltinType = " << BuiltinType << ";\n"; + OS << " break;\n"; + } + + OS << "#endif\n\n"; +} + namespace clang { void EmitSveHeader(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createHeader(OS); @@ -1645,6 +1693,10 @@ void EmitSveTypeFlags(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createTypeFlags(OS); } +void EmitSveStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SVE); +} + void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMEHeader(OS); } @@ -1660,4 +1712,8 @@ void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMERangeChecks(OS); } + +void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME); +} } // End namespace clang diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 38215abd9d9b..8800231a4314 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -83,10 +83,12 @@ enum ActionType { GenArmSveBuiltinCG, GenArmSveTypeFlags, GenArmSveRangeChecks, + GenArmSveStreamingAttrs, GenArmSmeHeader, GenArmSmeBuiltins, GenArmSmeBuiltinCG, GenArmSmeRangeChecks, + GenArmSmeStreamingAttrs, GenArmCdeHeader, GenArmCdeBuiltinDef, GenArmCdeBuiltinSema, @@ -233,6 +235,8 @@ cl::opt Action( "Generate arm_sve_typeflags.inc for clang"), clEnumValN(GenArmSveRangeChecks, "gen-arm-sve-sema-rangechecks", "Generate arm_sve_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSveStreamingAttrs, "gen-arm-sve-streaming-attrs", + "Generate arm_sve_streaming_attrs.inc for clang"), clEnumValN(GenArmSmeHeader, "gen-arm-sme-header", "Generate arm_sme.h for clang"), clEnumValN(GenArmSmeBuiltins, "gen-arm-sme-builtins", @@ -241,6 +245,8 @@ cl::opt Action( "Generate arm_sme_builtin_cg_map.inc for clang"), clEnumValN(GenArmSmeRangeChecks, "gen-arm-sme-sema-rangechecks", "Generate arm_sme_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSmeStreamingAttrs, "gen-arm-sme-streaming-attrs", + "Generate arm_sme_streaming_attrs.inc for clang"), 
clEnumValN(GenArmMveHeader, "gen-arm-mve-header", "Generate arm_mve.h for clang"), clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def", @@ -472,6 +478,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenArmSveRangeChecks: EmitSveRangeChecks(Records, OS); break; + case GenArmSveStreamingAttrs: + EmitSveStreamingAttrs(Records, OS); + break; case GenArmSmeHeader: EmitSmeHeader(Records, OS); break; @@ -484,6 +493,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenArmSmeRangeChecks: EmitSmeRangeChecks(Records, OS); break; + case GenArmSmeStreamingAttrs: + EmitSmeStreamingAttrs(Records, OS); + break; case GenArmCdeHeader: EmitCdeHeader(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 8265a531a98f..0504e8d03a92 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -102,11 +102,13 @@ void EmitSveBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -- Gitee From cb32cbee54679a16efc7e2c593587584a9bf7223 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Mon, 18 Dec 2023 16:14:25 +0000 Subject: [PATCH 50/77] [Clang][SME] Warn when a function doesn't have ZA state (#75805) This patch adds a warning that's emitted when a builtin call uses ZA state but the calling function doesn't provide any. Patch by David Sherwood . 
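
For illustration, a minimal sketch of the two cases the new diagnostic distinguishes (hypothetical functions, written against the arm_sme_draft_spec_subject_to_change.h interface used by the tests in this series; has_za/no_za are illustrative names, not names from the patch):

    #include "arm_sme_draft_spec_subject_to_change.h"

    // ZA state is provided via __arm_shared_za, so no diagnostic is expected.
    void has_za(svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
      svld1_hor_za8(0, 0, pg, ptr);
    }

    // No shared ZA state: the builtin uses ZA but the caller provides none,
    // which is the case the new warning flags.
    void no_za(svbool_t pg, const void *ptr) __arm_streaming {
      // expected-warning {{builtin call is not valid when calling from a function without active ZA state}}
      svld1_hor_za8(0, 0, pg, ptr);
    }

Like the streaming-mode diagnostics added earlier in this series, this is emitted as a warning rather than a hard error.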
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/CMakeLists.txt | 3 + .../clang/Basic/DiagnosticSemaKinds.td | 3 + clang/lib/Sema/SemaChecking.cpp | 24 +++ .../aarch64-sme-intrinsics/acle_sme_add-i32.c | 16 +- .../aarch64-sme-intrinsics/acle_sme_add-i64.c | 16 +- .../aarch64-sme-intrinsics/acle_sme_ld1.c | 20 +- .../acle_sme_ld1_vnum.c | 20 +- .../aarch64-sme-intrinsics/acle_sme_ldr.c | 10 +- .../acle_sme_mopa-za32.c | 14 +- .../acle_sme_mopa-za64.c | 10 +- .../acle_sme_mops-za32.c | 14 +- .../acle_sme_mops-za64.c | 10 +- .../aarch64-sme-intrinsics/acle_sme_read.c | 192 +++++++++--------- .../aarch64-sme-intrinsics/acle_sme_st1.c | 20 +- .../acle_sme_st1_vnum.c | 20 +- .../aarch64-sme-intrinsics/acle_sme_str.c | 10 +- .../aarch64-sme-intrinsics/acle_sme_write.c | 192 +++++++++--------- .../aarch64-sme-intrinsics/acle_sme_zero.c | 8 +- .../Sema/aarch64-incompat-sm-builtin-calls.c | 5 + .../aarch64-sme-intrinsics/acle_sme_imm.cpp | 14 +- .../aarch64-sme-intrinsics/acle_sme_target.c | 8 +- clang/utils/TableGen/SveEmitter.cpp | 32 +++ clang/utils/TableGen/TableGen.cpp | 6 + clang/utils/TableGen/TableGenBackends.h | 1 + 24 files changed, 371 insertions(+), 297 deletions(-) diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index c5ccb641ca80..cd1fa11f8556 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -111,6 +111,9 @@ clang_tablegen(arm_sme_sema_rangechecks.inc -gen-arm-sme-sema-rangechecks clang_tablegen(arm_sme_streaming_attrs.inc -gen-arm-sme-streaming-attrs SOURCE arm_sme.td TARGET ClangARMSmeStreamingAttrs) +clang_tablegen(arm_sme_builtins_za_state.inc -gen-arm-sme-builtin-za-state + SOURCE arm_sme.td + TARGET ClangARMSmeBuiltinsZAState) clang_tablegen(arm_cde_builtins.inc -gen-arm-cde-builtin-def SOURCE arm_cde.td TARGET ClangARMCdeBuiltinsDef) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f30532464aa4..d960701ba53d 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3104,6 +3104,9 @@ def err_attribute_arm_feature_sve_bits_unsupported : Error< def warn_attribute_arm_sm_incompat_builtin : Warning< "builtin call has undefined behaviour when called from a %0 function">, InGroup>; +def warn_attribute_arm_za_builtin_no_za_state : Warning< + "builtin call is not valid when calling from a function without active ZA state">, + InGroup>; def err_sve_vector_in_non_sve_target : Error< "SVE vector type %0 cannot be used in a target without sve">; def err_attribute_riscv_rvv_bits_unsupported : Error< diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index efc1aeea42d2..51113bac126d 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3052,6 +3052,25 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, } } +static bool hasSMEZAState(const FunctionDecl *FD) { + if (FD->hasAttr()) + return true; + if (const auto *T = FD->getType()->getAs()) + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) + return true; + return false; +} + +static bool hasSMEZAState(unsigned BuiltinID) { + switch (BuiltinID) { + default: + return false; +#define GET_SME_BUILTIN_HAS_ZA_STATE +#include "clang/Basic/arm_sme_builtins_za_state.inc" +#undef GET_SME_BUILTIN_HAS_ZA_STATE + } +} + bool 
Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { if (const FunctionDecl *FD = getCurFunctionDecl()) { std::optional BuiltinType; @@ -3064,6 +3083,11 @@ bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { if (BuiltinType) checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType); + + if (hasSMEZAState(BuiltinID) && !hasSMEZAState(FD)) + Diag(TheCall->getBeginLoc(), + diag::warn_attribute_arm_za_builtin_no_za_state) + << TheCall->getSourceRange(); } // Range check SME intrinsics that take immediate values. diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index cf5de1f0526d..6cf7ebd2c16b 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -21,7 +21,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming { +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn); } @@ -33,7 +33,7 @@ void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming { +void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn); } @@ -45,7 +45,7 @@ void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming { +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn); } @@ -57,7 +57,7 @@ void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming { +void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn); } @@ -69,7 +69,7 @@ void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_strea // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming { +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn); } @@ -81,7 +81,7 @@ void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming { +void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) 
__arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn);
 }
@@ -93,7 +93,7 @@ void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
+void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn);
 }
@@ -105,6 +105,6 @@ void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
+void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(3, pn, pm, zn);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
index 951262620965..1bab5a3f7597 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
@@ -21,7 +21,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn);
 }
@@ -33,7 +33,7 @@ void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn);
 }
@@ -45,7 +45,7 @@ void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn);
 }
@@ -57,7 +57,7 @@ void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn);
 }
@@ -69,7 +69,7 @@ void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_strea
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn);
 }
@@ -81,7 +81,7 @@ void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn);
 }
@@ -93,7 +93,7 @@ void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn);
 }
@@ -105,6 +105,6 @@ void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(7, pn, pm, zn);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
index ae972731e6e9..e12c5bbc9895 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
@@ -13,7 +13,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za8(0, slice_base, pg, ptr);
   svld1_hor_za8(0, slice_base + 15, pg, ptr);
 }
@@ -27,7 +27,7 @@ void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za16(0, slice_base, pg, ptr);
   svld1_hor_za16(1, slice_base + 7, pg, ptr);
 }
@@ -41,7 +41,7 @@ void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za32(0, slice_base, pg, ptr);
   svld1_hor_za32(3, slice_base + 3, pg, ptr);
 }
@@ -55,7 +55,7 @@ void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za64(0, slice_base, pg, ptr);
   svld1_hor_za64(7, slice_base + 1, pg, ptr);
 }
@@ -68,7 +68,7 @@ void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za128(0, slice_base, pg, ptr);
   svld1_hor_za128(15, slice_base, pg, ptr);
 }
@@ -81,7 +81,7 @@ void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __a
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za8(0, slice_base, pg, ptr);
   svld1_ver_za8(0, slice_base + 15, pg, ptr);
 }
@@ -95,7 +95,7 @@ void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za16(0, slice_base, pg, ptr);
   svld1_ver_za16(1, slice_base + 7, pg, ptr);
 }
@@ -109,7 +109,7 @@ void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za32(0, slice_base, pg, ptr);
   svld1_ver_za32(3, slice_base + 3, pg, ptr);
 }
@@ -123,7 +123,7 @@ void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za64(0, slice_base, pg, ptr);
   svld1_ver_za64(7, slice_base + 1, pg, ptr);
 }
@@ -136,7 +136,7 @@ void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za128(0, slice_base, pg, ptr);
   svld1_ver_za128(15, slice_base, pg, ptr);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
index 84011615636e..09b1acb19108 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
@@ -16,7 +16,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -33,7 +33,7 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -50,7 +50,7 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -67,7 +67,7 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -83,7 +83,7 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
@@ -99,7 +99,7 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -116,7 +116,7 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -133,7 +133,7 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -150,7 +150,7 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -166,7 +166,7 @@ void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index 49f7854d355b..e19f8f927e98 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -11,7 +11,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
+void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, 0);
 }
@@ -21,7 +21,7 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
+void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, 15);
 }
@@ -31,7 +31,7 @@ void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT: ret void
 //
-void test_svldr_za(uint32_t slice_base, const void *ptr) {
+void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_za(slice_base, ptr);
 }
@@ -42,7 +42,7 @@ void test_svldr_za(uint32_t slice_base, const void *ptr) {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) {
+void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, vnum);
 }
@@ -52,6 +52,6 @@ void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) {
+void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, 16);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index 4c0debe9a3a4..e762faa6ca4e 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
+void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -29,7 +29,7 @@ void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
@@ -41,7 +41,7 @@ void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
+void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
@@ -53,7 +53,7 @@ void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
+void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
@@ -65,7 +65,7 @@ void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
+void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
@@ -75,7 +75,7 @@ void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -85,6 +85,6 @@ void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
+void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index 4c91281f31c6..e9789b96b1b0 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -21,7 +21,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
+void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
@@ -33,7 +33,7 @@ void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
@@ -45,7 +45,7 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
+void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
@@ -57,7 +57,7 @@ void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
@@ -69,6 +69,6 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
+void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
index 68d0071e4af4..9256b8fadc7b 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
+void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -29,7 +29,7 @@ void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
@@ -41,7 +41,7 @@ void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
+void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
@@ -53,7 +53,7 @@ void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
+void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
@@ -65,7 +65,7 @@ void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
+void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
@@ -75,7 +75,7 @@ void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
@@ -85,6 +85,6 @@ void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
+void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index d2852b0ee563..d2e4d2a5c09e 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -21,7 +21,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
+void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
@@ -33,7 +33,7 @@ void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
@@ -45,7 +45,7 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
+void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
@@ -57,7 +57,7 @@ void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
@@ -69,6 +69,6 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
+void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
index 28a385e111f5..efc1f536c2e7 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
@@ -30,7 +30,7 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 15;
   return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -42,7 +42,7 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
@@ -54,7 +54,7 @@ svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -66,7 +66,7 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
@@ -78,7 +78,7 @@ svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -90,7 +90,7 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
@@ -102,7 +102,7 @@ svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -113,7 +113,7 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
@@ -124,7 +124,7 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 15;
   return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -136,7 +136,7 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
@@ -148,7 +148,7 @@ svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -160,7 +160,7 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
@@ -172,7 +172,7 @@ svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -184,7 +184,7 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
@@ -196,7 +196,7 @@ svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -208,7 +208,7 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
@@ -220,7 +220,7 @@ svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -232,7 +232,7 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
@@ -244,7 +244,7 @@ svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -256,7 +256,7 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
@@ -268,7 +268,7 @@ svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -280,7 +280,7 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
@@ -292,7 +292,7 @@ svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -303,7 +303,7 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
@@ -313,7 +313,7 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
@@ -324,7 +324,7 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
@@ -335,7 +335,7 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
@@ -346,7 +346,7 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
@@ -357,7 +357,7 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
@@ -368,7 +368,7 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
@@ -379,7 +379,7 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
@@ -389,7 +389,7 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
@@ -399,7 +399,7 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
@@ -410,7 +410,7 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
@@ -421,7 +421,7 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
@@ -432,7 +432,7 @@ svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
@@ -443,7 +443,7 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
@@ -454,7 +454,7 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
@@ -465,7 +465,7 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
@@ -476,7 +476,7 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
@@ -487,7 +487,7 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
@@ -498,7 +498,7 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
@@ -509,7 +509,7 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
@@ -520,7 +520,7 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
@@ -531,7 +531,7 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
@@ -542,7 +542,7 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
@@ -553,7 +553,7 @@ svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
@@ -563,7 +563,7 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
@@ -574,7 +574,7 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 15;
   return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -586,7 +586,7 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
@@ -598,7 +598,7 @@ svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -610,7 +610,7 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
@@ -622,7 +622,7 @@ svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -634,7 +634,7 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
@@ -646,7 +646,7 @@ svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -657,7 +657,7 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
@@ -668,7 +668,7 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 15;
   return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -680,7 +680,7 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
@@ -692,7 +692,7 @@ svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -704,7 +704,7 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
@@ -716,7 +716,7 @@ svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -728,7 +728,7 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
@@ -740,7 +740,7 @@ svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -752,7 +752,7 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
@@ -764,7 +764,7 @@ svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -776,7 +776,7 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
@@ -788,7 +788,7 @@ svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -800,7 +800,7 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
@@ -812,7 +812,7 @@ svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -824,7 +824,7 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
@@ -836,7 +836,7 @@ svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -847,7 +847,7 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
   return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base);
} @@ -857,7 +857,7 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base); } @@ -868,7 +868,7 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base); } @@ -879,7 +879,7 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base); } @@ -890,7 +890,7 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base); } @@ -901,7 +901,7 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base); } @@ -912,7 +912,7 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base); } @@ -923,7 +923,7 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( 
[[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base); } @@ -933,7 +933,7 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base); } @@ -943,7 +943,7 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base); } @@ -954,7 +954,7 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base); } @@ -965,7 +965,7 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base); } @@ -976,7 +976,7 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base); } @@ -987,7 +987,7 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t 
slice_base) __arm_streaming { +svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base); } @@ -998,7 +998,7 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base); } @@ -1009,7 +1009,7 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base); } @@ -1020,7 +1020,7 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base); } @@ -1031,7 +1031,7 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base); } @@ -1042,7 +1042,7 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base); } @@ -1053,7 +1053,7 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t 
slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base); } @@ -1064,7 +1064,7 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base); } @@ -1075,7 +1075,7 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base); } @@ -1086,7 +1086,7 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base); } @@ -1097,6 +1097,6 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { +svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index 31708906f8c0..052eceffccda 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -13,7 +13,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_hor_za8(0, slice_base, pg, ptr); svst1_hor_za8(0, slice_base + 15, pg, ptr); } @@ -27,7 +27,7 @@ void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) 
__arm_streaming __arm_shared_za { svst1_hor_za16(0, slice_base, pg, ptr); svst1_hor_za16(1, slice_base + 7, pg, ptr); } @@ -41,7 +41,7 @@ void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_hor_za32(0, slice_base, pg, ptr); svst1_hor_za32(3, slice_base + 3, pg, ptr); } @@ -55,7 +55,7 @@ void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_hor_za64(0, slice_base, pg, ptr); svst1_hor_za64(7, slice_base + 1, pg, ptr); } @@ -68,7 +68,7 @@ void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_hor_za128(0, slice_base, pg, ptr); svst1_hor_za128(15, slice_base, pg, ptr); } @@ -81,7 +81,7 @@ void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_str // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_ver_za8(0, slice_base, pg, ptr); svst1_ver_za8(0, slice_base + 15, pg, ptr); } @@ -95,7 +95,7 @@ void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_ver_za16(0, slice_base, pg, ptr); svst1_ver_za16(1, slice_base + 7, pg, ptr); } @@ -109,7 +109,7 @@ void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_ver_za32(0, slice_base, pg, ptr); svst1_ver_za32(3, slice_base + 3, pg, ptr); } @@ -123,7 +123,7 @@ void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, 
void *ptr) __arm_streaming { +void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_ver_za64(0, slice_base, pg, ptr); svst1_ver_za64(7, slice_base + 1, pg, ptr); } @@ -136,7 +136,7 @@ void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming { +void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_ver_za128(0, slice_base, pg, ptr); svst1_ver_za128(15, slice_base, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c index e6884739f6ba..3892050d8855 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -16,7 +16,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_hor_vnum_za8(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } @@ -33,7 +33,7 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_hor_vnum_za16(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } @@ -50,7 +50,7 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_hor_vnum_za32(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } @@ -67,7 +67,7 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_hor_vnum_za64(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } @@ -83,7 +83,7 @@ void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // 
-void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_hor_vnum_za128(0, slice_base, pg, ptr, vnum); svst1_hor_vnum_za128(15, slice_base, pg, ptr, vnum); } @@ -99,7 +99,7 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_ver_vnum_za8(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } @@ -116,7 +116,7 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_ver_vnum_za16(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } @@ -133,7 +133,7 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_ver_vnum_za32(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } @@ -150,7 +150,7 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_ver_vnum_za64(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } @@ -166,7 +166,7 @@ void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming { +void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za { svst1_ver_vnum_za128(0, slice_base, pg, ptr, vnum); svst1_ver_vnum_za128(15, slice_base, pg, ptr, vnum); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index aebc1d56be25..5256b63907ae 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ 
-11,7 +11,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // -void test_svstr_vnum_za(uint32_t slice_base, void *ptr) { +void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_shared_za { svstr_vnum_za(slice_base, ptr, 0); } @@ -21,7 +21,7 @@ void test_svstr_vnum_za(uint32_t slice_base, void *ptr) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) // CHECK-NEXT: ret void // -void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { +void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_shared_za { svstr_vnum_za(slice_base, ptr, 15); } @@ -31,7 +31,7 @@ void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // -void test_svstr_za(uint32_t slice_base, void *ptr) { +void test_svstr_za(uint32_t slice_base, void *ptr) __arm_shared_za { svstr_za(slice_base, ptr); } @@ -42,7 +42,7 @@ void test_svstr_za(uint32_t slice_base, void *ptr) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) // CHECK-NEXT: ret void // -void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) { +void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_shared_za { svstr_vnum_za(slice_base, ptr, vnum); } @@ -52,6 +52,6 @@ void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) // CHECK-NEXT: ret void // -void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) { +void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_shared_za { svstr_vnum_za(slice_base, ptr, 16); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c index f574eec13ecb..7a2dafcce66d 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -19,7 +19,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { +void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn); } @@ -30,7 +30,7 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { +void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn); } @@ -42,7 +42,7 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_hor_za16_s16(uint32_t 
slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn); } @@ -54,7 +54,7 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn); } @@ -66,7 +66,7 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn); } @@ -78,7 +78,7 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn); } @@ -90,7 +90,7 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn); } @@ -102,7 +102,7 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn); } @@ -113,7 +113,7 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn); } @@ -124,7 +124,7 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a // CHECK-NEXT: tail call void 
@llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn); } @@ -136,7 +136,7 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn); } @@ -148,7 +148,7 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn); } @@ -160,7 +160,7 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn); } @@ -172,7 +172,7 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice, pg, zn); } @@ -184,7 +184,7 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn); } @@ -196,7 +196,7 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t 
zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn); } @@ -208,7 +208,7 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn); } @@ -220,7 +220,7 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn); } @@ -232,7 +232,7 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn); } @@ -244,7 +244,7 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn); } @@ -256,7 +256,7 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn); } @@ -268,7 +268,7 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn); } @@ -280,7 +280,7 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z // CHECK-NEXT: tail call 
void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { +void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn); } @@ -292,7 +292,7 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { +void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn); } @@ -303,7 +303,7 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { +void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn); } @@ -313,7 +313,7 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { +void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn); } @@ -324,7 +324,7 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn); } @@ -335,7 +335,7 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn); } @@ -346,7 +346,7 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming 
__arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn); } @@ -357,7 +357,7 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn); } @@ -368,7 +368,7 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn); } @@ -379,7 +379,7 @@ void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn); } @@ -389,7 +389,7 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn); } @@ -399,7 +399,7 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn); } @@ -410,7 +410,7 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn); } @@ -421,7 +421,7 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // 
-void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn); } @@ -432,7 +432,7 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn); } @@ -443,7 +443,7 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn); } @@ -454,7 +454,7 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn); } @@ -465,7 +465,7 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn); } @@ -476,7 +476,7 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn); } @@ -487,7 +487,7 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn); } @@ -498,7 +498,7 @@ void 
test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn); } @@ -509,7 +509,7 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn); } @@ -520,7 +520,7 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn); } @@ -531,7 +531,7 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn); } @@ -542,7 +542,7 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { +void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn); } @@ -553,7 +553,7 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { +void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn); } @@ -563,7 +563,7 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) 
__arm_streaming { +void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn); } @@ -574,7 +574,7 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { +void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn); } @@ -586,7 +586,7 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn); } @@ -598,7 +598,7 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn); } @@ -610,7 +610,7 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn); } @@ -622,7 +622,7 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn); } @@ -634,7 +634,7 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn); } @@ -646,7 +646,7 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _ // CHECK-NEXT: 
tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn); } @@ -657,7 +657,7 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn); } @@ -668,7 +668,7 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 15; SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn); } @@ -680,7 +680,7 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn); } @@ -692,7 +692,7 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn); } @@ -704,7 +704,7 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn); } @@ -716,7 +716,7 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t 
zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn); } @@ -728,7 +728,7 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn); } @@ -740,7 +740,7 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn); } @@ -752,7 +752,7 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn); } @@ -764,7 +764,7 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn); } @@ -776,7 +776,7 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn); } @@ -788,7 +788,7 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 7; SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn); } @@ -800,7 +800,7 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void 
@llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn); } @@ -812,7 +812,7 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 3; SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn); } @@ -824,7 +824,7 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { +void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn); } @@ -836,7 +836,7 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { +void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { uint32_t slice = slice_base + 1; SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn); } @@ -847,7 +847,7 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { +void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn); } @@ -857,7 +857,7 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming { +void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, pg, zn); } @@ -868,7 +868,7 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) 
__arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn); } @@ -879,7 +879,7 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming { +void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn); } @@ -890,7 +890,7 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn); } @@ -901,7 +901,7 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming { +void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn); } @@ -912,7 +912,7 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn); } @@ -923,7 +923,7 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming { +void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn); } @@ -933,7 +933,7 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn); } @@ -943,7 +943,7 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret 
void // -void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming { +void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn); } @@ -954,7 +954,7 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn); } @@ -965,7 +965,7 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming { +void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn); } @@ -976,7 +976,7 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn); } @@ -987,7 +987,7 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming { +void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn); } @@ -998,7 +998,7 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn); } @@ -1009,7 +1009,7 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming { +void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn); } @@ -1020,7 +1020,7 @@ void 
test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn); } @@ -1031,7 +1031,7 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn); } @@ -1042,7 +1042,7 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn); } @@ -1053,7 +1053,7 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming { +void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn); } @@ -1064,7 +1064,7 @@ void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn); } @@ -1075,7 +1075,7 @@ void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming { +void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn); } @@ -1086,7 +1086,7 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, 
svfloat64_t zn) __arm_streaming { +void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn); } @@ -1097,6 +1097,6 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming { +void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index 3ff9f6346c49..9aebe9d42cbf 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -11,7 +11,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 0) // CHECK-NEXT: ret void // -void test_svzero_mask_za() { +void test_svzero_mask_za(void) __arm_shared_za { svzero_mask_za(0); } @@ -21,7 +21,7 @@ void test_svzero_mask_za() { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 176) // CHECK-NEXT: ret void // -void test_svzero_mask_za_1() { +void test_svzero_mask_za_1(void) __arm_shared_za { svzero_mask_za(176); } @@ -31,7 +31,7 @@ void test_svzero_mask_za_1() { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) // CHECK-NEXT: ret void // -void test_svzero_mask_za_2() { +void test_svzero_mask_za_2(void) __arm_shared_za { svzero_mask_za(255); } @@ -41,6 +41,6 @@ void test_svzero_mask_za_2() { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) // CHECK-NEXT: ret void // -void test_svzero_za() { +void test_svzero_za(void) __arm_shared_za { svzero_za(); } diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 361a9e82a3ad..e63d9f0a8475 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -97,3 +97,8 @@ svbool_t streaming_caller_ptrue(void) __arm_streaming { // expected-no-warning return svand_z(svptrue_b16(), svptrue_pat_b16(SV_ALL), svptrue_pat_b16(SV_VL4)); } + +svint8_t missing_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + return svread_hor_za8_s8_m(zd, pg, 0, slice_base); +} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index 47c7210206b0..529d0d2d1e62 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -12,7 +12,7 @@ #include -void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { +void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} @@ -32,7 +32,7 @@ void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, slice, pg, 
svundef_s8()); } -void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { +void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} @@ -52,7 +52,7 @@ void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, slice, pg, svundef_s16()); } -void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { +void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} @@ -90,7 +90,7 @@ void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { SVE_ACLE_FUNC(svusmops_za32, _u8, _m,)(-1, pg, pg, svundef_u8(), svundef_s8()); } -void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { +void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} @@ -133,7 +133,7 @@ void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { SVE_ACLE_FUNC(svmops_za64, _f64, _m,)(-1, pg, pg, svundef_f64(), svundef_f64()); } -void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { +void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} @@ -153,14 +153,14 @@ void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming { SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, slice, pg, svundef_s8()); } -void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming { +void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(256); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(-1); } -void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming { +void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}} SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c index 7cfe9fdfbd24..95bb6be2d2d3 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c +++ 
b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c @@ -6,21 +6,21 @@ #include __attribute__((target("sme"))) -void test_sme(svbool_t pg, void *ptr) __arm_streaming { +void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svld1_hor_za8(0, 0, pg, ptr); } __attribute__((target("arch=armv8-a+sme"))) -void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming { +void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svld1_hor_vnum_za32(0, 0, pg, ptr, 0); } __attribute__((target("+sme"))) -void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming { +void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { svst1_ver_za16(0, 0, pg, ptr); } __attribute__((target("+sme"))) -void undefined(svbool_t pg, void *ptr) { +void undefined(svbool_t pg, void *ptr) __arm_shared_za { svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-warning {{builtin call has undefined behaviour when called from a non-streaming function}} } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index a9b5817d760b..874b8e4e1893 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -377,6 +377,9 @@ public: /// Emit all the range checks for the immediates. void createSMERangeChecks(raw_ostream &o); + /// Create a table for a builtin's requirement for PSTATE.ZA. + void createBuiltinZAState(raw_ostream &OS); + /// Create intrinsic and add it to \p Out void createIntrinsic(Record *R, SmallVectorImpl> &Out); @@ -1627,6 +1630,31 @@ void SVEEmitter::createSMERangeChecks(raw_ostream &OS) { OS << "#endif\n\n"; } +void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + std::map> DefsZAState; + + uint64_t IsSharedZAFlag = getEnumValueForFlag("IsSharedZA"); + for (auto &Def : Defs) { + bool HasZAState = Def->isFlagSet(IsSharedZAFlag); + DefsZAState[HasZAState].insert(Def->getMangledName()); + } + + OS << "#ifdef GET_SME_BUILTIN_HAS_ZA_STATE\n"; + + for (auto HasZA : {true, false}) { + auto Names = DefsZAState[HasZA]; + for (auto Name : Names) + OS << "case SME::BI__builtin_sme_" << Name << ":\n"; + OS << " return " << (HasZA ? 
"true" : "false") << ";\n"; + } + OS << "#endif\n\n"; +} + void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; @@ -1716,4 +1744,8 @@ void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) { void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME); } + +void EmitSmeBuiltinZAState(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createBuiltinZAState(OS); +} } // End namespace clang diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 8800231a4314..4ff7b7d43aab 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -89,6 +89,7 @@ enum ActionType { GenArmSmeBuiltinCG, GenArmSmeRangeChecks, GenArmSmeStreamingAttrs, + GenArmSmeBuiltinZAState, GenArmCdeHeader, GenArmCdeBuiltinDef, GenArmCdeBuiltinSema, @@ -247,6 +248,8 @@ cl::opt Action( "Generate arm_sme_sema_rangechecks.inc for clang"), clEnumValN(GenArmSmeStreamingAttrs, "gen-arm-sme-streaming-attrs", "Generate arm_sme_streaming_attrs.inc for clang"), + clEnumValN(GenArmSmeBuiltinZAState, "gen-arm-sme-builtin-za-state", + "Generate arm_sme_builtins_za_state.inc for clang"), clEnumValN(GenArmMveHeader, "gen-arm-mve-header", "Generate arm_mve.h for clang"), clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def", @@ -496,6 +499,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenArmSmeStreamingAttrs: EmitSmeStreamingAttrs(Records, OS); break; + case GenArmSmeBuiltinZAState: + EmitSmeBuiltinZAState(Records, OS); + break; case GenArmCdeHeader: EmitCdeHeader(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 0504e8d03a92..ce552ebe73db 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -109,6 +109,7 @@ void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -- Gitee From 017e576a0bee905434ce3463d818bb114639b1f7 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 2 Jan 2024 09:43:30 +0000 Subject: [PATCH 51/77] [Clang][AArch64] Add missing SME functions to header file. 
(#75791) This includes: * __arm_in_streaming_mode() * __arm_has_sme() * __arm_za_disable() * __svundef_za() Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/BuiltinsAArch64.def | 3 + clang/lib/CodeGen/CGBuiltin.cpp | 20 ++++++ .../acle_sme_state_funs.c | 72 +++++++++++++++++++ clang/utils/TableGen/SveEmitter.cpp | 19 +++++ 4 files changed, 114 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def index eaae6c9ad846..33eb1dc2f6d8 100644 --- a/clang/include/clang/Basic/BuiltinsAArch64.def +++ b/clang/include/clang/Basic/BuiltinsAArch64.def @@ -68,6 +68,9 @@ TARGET_BUILTIN(__builtin_arm_ldg, "v*v*", "t", "mte") TARGET_BUILTIN(__builtin_arm_stg, "vv*", "t", "mte") TARGET_BUILTIN(__builtin_arm_subp, "Uiv*v*", "t", "mte") +// SME state function +BUILTIN(__builtin_arm_get_sme_state, "vULi*ULi*", "n") + // Memory Operations TARGET_BUILTIN(__builtin_arm_mops_memset_tag, "v*v*iz", "", "mte,mops") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 07d6c20118e0..87118ed823a7 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10062,6 +10062,26 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); } + if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) { + // Create call to __arm_sme_state and store the results to the two pointers. + CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction( + llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {}, + false), + "__arm_sme_state")); + auto Attrs = + AttributeList() + .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved"); + CI->setAttributes(Attrs); + CI->setCallingConv( + llvm::CallingConv:: + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2); + Builder.CreateStore(Builder.CreateExtractValue(CI, 0), + EmitPointerWithAlignment(E->getArg(0))); + return Builder.CreateStore(Builder.CreateExtractValue(CI, 1), + EmitPointerWithAlignment(E->getArg(1))); + } + if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) { assert((getContext().getTypeSize(E->getType()) == 32) && "rbit of unusual size!"); diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c new file mode 100644 index 000000000000..282819c8ca35 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -0,0 +1,72 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_in_streaming_mode( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: 
[[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z22test_in_streaming_modev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_in_streaming_mode(void) __arm_streaming_compatible { + return __arm_in_streaming_mode(); +} + +// CHECK-LABEL: @test_za_disable( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_za_disablev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CPP-CHECK-NEXT: ret void +// +void test_za_disable(void) __arm_streaming_compatible { + __arm_za_disable(); +} + +// CHECK-LABEL: @test_has_sme( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z12test_has_smev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_has_sme(void) __arm_streaming_compatible { + return __arm_has_sme(); +} + +// CHECK-LABEL: @test_svundef_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svundef_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret void +// +void test_svundef_za(void) __arm_streaming_compatible __arm_shared_za { + svundef_za(); +} + diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 874b8e4e1893..c239b9d300df 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1525,6 +1525,25 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "extern \"C\" {\n"; OS << "#endif\n\n"; + OS << "void __arm_za_disable(void) __arm_streaming_compatible;\n\n"; + + OS << "__ai bool __arm_has_sme(void) __arm_streaming_compatible {\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & (1ULL << 63);\n"; + OS << "}\n\n"; + + OS << "__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible " + "{\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & 1;\n"; + OS << "}\n\n"; + + OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) " + "__arm_streaming_compatible __arm_shared_za " + "{ }\n\n"; + createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME); OS << "#ifdef __cplusplus\n"; -- Gitee From e0119880ed0d31a0594f737fef74122831c24096 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 11 Jan 2024 14:47:32 +0000 Subject: [PATCH 52/77] [AArch64] Enable certain instruction aliases for SVE/SME (#77745) Several SVE instruction aliases accept predicate-as-counter register names as a 
convenience. These ought to be enabled with SVE/SME because the underlying encoding is valid and it's required by Arm ARM. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 4 +++- .../SVE/predicate-as-counter-aliases.s | 22 +++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 741d228b3cad..1d00f152940a 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3918,7 +3918,9 @@ defm WHILEHS_CXX : sve2p1_int_while_rr_pn<"whilehs", 0b100>; defm WHILEHI_CXX : sve2p1_int_while_rr_pn<"whilehi", 0b101>; defm WHILELO_CXX : sve2p1_int_while_rr_pn<"whilelo", 0b110>; defm WHILELS_CXX : sve2p1_int_while_rr_pn<"whilels", 0b111>; +} // End HasSVE2p1_or_HasSME2 +let Predicates = [HasSVEorSME] in { // Aliases for existing SVE instructions for which predicate-as-counter are // accepted as an operand to the instruction @@ -3937,7 +3939,7 @@ def : InstAlias<"mov $Pd, $Pn", def : InstAlias<"pfalse\t$Pd", (PFALSE PNRasPPR8:$Pd), 0>; -} // End HasSVE2p1_or_HasSME2 +} //===----------------------------------------------------------------------===// // SVE2.1 non-widening BFloat16 to BFloat16 instructions diff --git a/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s b/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s index bca2cf913ff6..fd6c59888e09 100644 --- a/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s +++ b/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s @@ -1,50 +1,50 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p1 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2 < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p1 < %s \ -// RUN: | llvm-objdump --no-print-imm-hex -d --mattr=+sve2p1 - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p1 < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve < %s \ +// RUN: | llvm-objdump --no-print-imm-hex -d --mattr=+sve - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve < %s \ // RUN: | llvm-objdump --no-print-imm-hex -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN ldr pn0, [x0] // CHECK-INST: ldr p0, [x0] // CHECK-ENCODING: [0x00,0x00,0x80,0x85] -// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 85800000 ldr pn5, [x10, #255, mul vl] // CHECK-INST: ldr p5, [x10, #255, mul vl] // CHECK-ENCODING: [0x45,0x1d,0x9f,0x85] -// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 859f1d45 str pn0, [x0] // CHECK-INST: str p0, [x0] // CHECK-ENCODING: [0x00,0x00,0x80,0xe5] -// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5800000 str pn5, [x10, #255, mul vl] // CHECK-INST: str p5, [x10, #255, mul vl] // CHECK-ENCODING: 
[0x45,0x1d,0x9f,0xe5] -// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e59f1d45 mov pn0.b, pn0.b // CHECK-INST: mov p0.b, p0.b // CHECK-ENCODING: [0x00,0x40,0x80,0x25] -// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25804000 pfalse pn15.b // CHECK-INST: pfalse p15.b // CHECK-ENCODING: [0x0f,0xe4,0x18,0x25] -// CHECK-ERROR: instruction requires: sme2 or sve2p1 +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 2518e40f -- Gitee From 93cbf2aa3d2d55cf3a90d10e8f43b4fcef8a523a Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 15 Jan 2024 09:41:32 +0000 Subject: [PATCH 53/77] [Clang][AArch64] Change SME attributes for shared/new/preserved state. (#76971) This patch replaces the `__arm_new_za`, `__arm_shared_za` and `__arm_preserves_za` attributes in favour of: * `__arm_new("za")` * `__arm_in("za")` * `__arm_out("za")` * `__arm_inout("za")` * `__arm_preserves("za")` As described in https://github.com/ARM-software/acle/pull/276. One change is that `__arm_in/out/inout/preserves(S)` are all mutually exclusive, whereas previously it was fine to write `__arm_shared_za __arm_preserves_za`. This case is now represented with `__arm_in("za")`. The current implementation uses the same LLVM attributes under the hood, since `__arm_in/out/inout` are all variations of "shared ZA", so can use the existing `aarch64_pstate_za_shared` attribute in LLVM. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/AST/Type.h | 23 +- clang/include/clang/Basic/Attr.td | 49 ++- clang/include/clang/Basic/AttrDocs.td | 82 +++- .../include/clang/Basic/AttributeCommonInfo.h | 13 + clang/include/clang/Basic/CMakeLists.txt | 4 +- .../clang/Basic/DiagnosticSemaKinds.td | 6 + clang/include/clang/Basic/TokenKinds.def | 4 +- clang/include/clang/Basic/TokenKinds.h | 2 +- clang/include/clang/Sema/Sema.h | 10 +- clang/lib/AST/TypePrinter.cpp | 31 +- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- clang/lib/CodeGen/CGCall.cpp | 18 +- clang/lib/CodeGen/CodeGenModule.cpp | 6 +- clang/lib/Parse/ParseDecl.cpp | 6 + clang/lib/Parse/ParseDeclCXX.cpp | 24 +- clang/lib/Parse/ParseTentative.cpp | 3 +- clang/lib/Sema/SemaChecking.cpp | 27 +- clang/lib/Sema/SemaDecl.cpp | 9 +- clang/lib/Sema/SemaDeclAttr.cpp | 78 +++- clang/lib/Sema/SemaDeclCXX.cpp | 4 +- clang/lib/Sema/SemaExpr.cpp | 27 +- clang/lib/Sema/SemaOverload.cpp | 20 - clang/lib/Sema/SemaType.cpp | 80 +++- clang/test/AST/ast-dump-sme-attributes.cpp | 12 +- .../aarch64-sme-attrs.cpp | 36 +- .../aarch64-sme-intrinsics/acle_sme_add-i32.c | 16 +- .../aarch64-sme-intrinsics/acle_sme_add-i64.c | 16 +- .../aarch64-sme-intrinsics/acle_sme_ld1.c | 20 +- .../acle_sme_ld1_vnum.c | 20 +- .../aarch64-sme-intrinsics/acle_sme_ldr.c | 10 +- .../acle_sme_mopa-za32.c | 14 +- .../acle_sme_mopa-za64.c | 10 +- .../acle_sme_mops-za32.c | 14 +- .../acle_sme_mops-za64.c | 10 +- .../aarch64-sme-intrinsics/acle_sme_read.c | 192 ++++----- .../aarch64-sme-intrinsics/acle_sme_st1.c | 20 +- .../acle_sme_st1_vnum.c | 20 +- .../acle_sme_state_funs.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_str.c | 10 +- .../aarch64-sme-intrinsics/acle_sme_write.c | 192 ++++----- .../aarch64-sme-intrinsics/acle_sme_zero.c | 8 +- .../aarch64-sme2-intrinsics/acle_sme2_add.c | 56 +-- .../aarch64-sme2-intrinsics/acle_sme2_sub.c | 56 +-- clang/test/Modules/aarch64-sme-keywords.cppm | 8 +-
clang/test/Parser/c2x-attribute-keywords.c | 161 ++++---- clang/test/Parser/c2x-attribute-keywords.m | 4 +- .../test/Parser/cxx0x-keyword-attributes.cpp | 367 +++++++++--------- .../Sema/aarch64-incompat-sm-builtin-calls.c | 4 +- ...-sme-func-attrs-without-target-feature.cpp | 12 +- clang/test/Sema/aarch64-sme-func-attrs.c | 202 ++++++---- .../aarch64-sme-intrinsics/acle_sme_imm.cpp | 14 +- .../aarch64-sme-intrinsics/acle_sme_target.c | 8 +- clang/utils/TableGen/ClangAttrEmitter.cpp | 32 +- clang/utils/TableGen/SveEmitter.cpp | 2 +- clang/utils/TableGen/TableGen.cpp | 21 +- clang/utils/TableGen/TableGenBackends.h | 4 +- 56 files changed, 1203 insertions(+), 898 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 6f843015f85f..80cfd50d91df 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3973,12 +3973,27 @@ public: SME_NormalFunction = 0, SME_PStateSMEnabledMask = 1 << 0, SME_PStateSMCompatibleMask = 1 << 1, - SME_PStateZASharedMask = 1 << 2, - SME_PStateZAPreservedMask = 1 << 3, - SME_AttributeMask = 0b111'111 // We only support maximum 6 bits because of the - // bitmask in FunctionTypeExtraBitfields. + + // Describes the value of the state using ArmStateValue. + SME_ZAShift = 2, + SME_ZAMask = 0b111 << SME_ZAShift, + + SME_AttributeMask = 0b111'111 // We only support maximum 6 bits because of + // the bitmask in FunctionTypeExtraBitfields. + }; + + enum ArmStateValue : unsigned { + ARM_None = 0, + ARM_Preserves = 1, + ARM_In = 2, + ARM_Out = 3, + ARM_InOut = 4, }; + static ArmStateValue getArmZAState(unsigned AttrBits) { + return (ArmStateValue)((AttrBits & SME_ZAMask) >> SME_ZAShift); + } + /// A simple holder for various uncommon bits which do not fit in /// FunctionTypeBitfields. Aligned to alignof(void *) to maintain the /// alignment of subsequent objects in TrailingObjects. 
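The Type.h hunk above is the core of the new representation: the old single-bit "shared ZA" and "preserves ZA" flags become one three-bit `ArmStateValue` field inside the SME attribute bits, so that the in, out, inout and preserves flavours can all be distinguished. Below is a minimal standalone C++ sketch of that encoding, for illustration only; the `setArmZAState` helper is hypothetical and not part of the patch (clang itself composes these bits in Sema while processing the keyword attributes).

```cpp
#include <cassert>

// Mirrors the bit layout from the FunctionType hunk above: two PSTATE.SM
// bits, then a three-bit field holding the ZA state.
enum AArch64SMETypeAttributes : unsigned {
  SME_NormalFunction         = 0,
  SME_PStateSMEnabledMask    = 1 << 0,
  SME_PStateSMCompatibleMask = 1 << 1,
  SME_ZAShift                = 2,
  SME_ZAMask                 = 0b111 << SME_ZAShift,
};

enum ArmStateValue : unsigned {
  ARM_None = 0,
  ARM_Preserves = 1,
  ARM_In = 2,
  ARM_Out = 3,
  ARM_InOut = 4,
};

// Same extraction logic as FunctionType::getArmZAState in the hunk above.
constexpr ArmStateValue getArmZAState(unsigned AttrBits) {
  return ArmStateValue((AttrBits & SME_ZAMask) >> SME_ZAShift);
}

// Hypothetical inverse (illustration only): pack a ZA state into the field.
constexpr unsigned setArmZAState(unsigned AttrBits, ArmStateValue S) {
  return (AttrBits & ~unsigned(SME_ZAMask)) | (unsigned(S) << SME_ZAShift);
}

int main() {
  // A type like 'void f() __arm_streaming __arm_inout("za")' would carry
  // the SM-enabled bit plus ARM_InOut in the ZA field.
  unsigned Bits = setArmZAState(SME_PStateSMEnabledMask, ARM_InOut);
  assert(getArmZAState(Bits) == ARM_InOut);
  assert(Bits & SME_PStateSMEnabledMask); // streaming bit is untouched

  // The old '__arm_shared_za __arm_preserves_za' combination maps to ARM_In.
  assert(getArmZAState(setArmZAState(0, ARM_In)) == ARM_In);
  return 0;
}
```

Because every sharing flavour of a given state occupies the same field, the mutual exclusion of `__arm_in`, `__arm_out`, `__arm_inout` and `__arm_preserves` for one state follows directly from the representation.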
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 20d4c7262884..ad0ad208a790 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2445,16 +2445,45 @@ def ArmStreamingCompatible : TypeAttr, TargetSpecificAttr { let Documentation = [ArmSmeStreamingCompatibleDocs]; } -def ArmSharedZA : TypeAttr, TargetSpecificAttr { - let Spellings = [RegularKeyword<"__arm_shared_za">]; +def ArmNew : InheritableAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_new">]; + let Args = [VariadicStringArgument<"NewArgs">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [ArmNewDocs]; + + let AdditionalMembers = [{ + bool isNewZA() const { + return llvm::is_contained(newArgs(), "za"); + } + }]; +} + +def ArmIn : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_in">]; + let Args = [VariadicStringArgument<"InArgs">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmInDocs]; +} + +def ArmOut : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_out">]; + let Args = [VariadicStringArgument<"OutArgs">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmOutDocs]; +} + +def ArmInOut : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_inout">]; + let Args = [VariadicStringArgument<"InOutArgs">]; let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; - let Documentation = [ArmSmeSharedZADocs]; + let Documentation = [ArmInOutDocs]; } -def ArmPreservesZA : TypeAttr, TargetSpecificAttr { - let Spellings = [RegularKeyword<"__arm_preserves_za">]; +def ArmPreserves : TypeAttr, TargetSpecificAttr { + let Spellings = [RegularKeyword<"__arm_preserves">]; + let Args = [VariadicStringArgument<"PreserveArgs">]; let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; - let Documentation = [ArmSmePreservesZADocs]; + let Documentation = [ArmPreservesDocs]; } def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr { @@ -2463,14 +2492,6 @@ def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr { let Documentation = [ArmSmeLocallyStreamingDocs]; } -def ArmNewZA : InheritableAttr, TargetSpecificAttr { - let Spellings = [RegularKeyword<"__arm_new_za">]; - let Subjects = SubjectList<[Function], ErrorDiag>; - let Documentation = [ArmSmeNewZADocs]; -} -def : MutualExclusions<[ArmNewZA, ArmSharedZA]>; -def : MutualExclusions<[ArmNewZA, ArmPreservesZA]>; - def Pure : InheritableAttr { let Spellings = [GCC<"pure">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 708d0b0fcfcb..395982c45ecf 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6652,30 +6652,73 @@ without changing modes. }]; } -def ArmSmeSharedZADocs : Documentation { +def ArmInDocs : Documentation { let Category = DocCatArmSmeAttributes; let Content = [{ -The ``__arm_shared_za`` keyword applies to prototyped function types and specifies -that the function shares SME's matrix storage (ZA) with its caller. This -means that: +The ``__arm_in`` keyword applies to prototyped function types and specifies +that the function shares a given state S with its caller. For ``__arm_in``, the +function takes the state S as input and returns with the state S unchanged. -* the function requires that the processor implements the Scalable Matrix - Extension (SME). 
+The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: -* the function enters with ZA in an active state. +* ``"za"`` for Matrix Storage (requires SME) -* the function returns with ZA in an active state. +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. }]; } -def ArmSmePreservesZADocs : Documentation { +def ArmOutDocs : Documentation { let Category = DocCatArmSmeAttributes; let Content = [{ -The ``__arm_preserves_za`` keyword applies to prototyped function types and -specifies that the function does not modify ZA state. +The ``__arm_out`` keyword applies to prototyped function types and specifies +that the function shares a given state S with its caller. For ``__arm_out``, +the function ignores the incoming state for S and returns new state for S. + +The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. }]; } +def ArmInOutDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_inout`` keyword applies to prototyped function types and specifies +that the function shares a given state S with its caller. For ``__arm_inout``, +the function takes the state S as input and returns new state for S. + +The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. + }]; +} + +def ArmPreservesDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_preserves`` keyword applies to prototyped function types and +specifies that the function does not read a given state S and returns +with state S unchanged. + +The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. + }]; +} def ArmSmeLocallyStreamingDocs : Documentation { let Category = DocCatArmSmeAttributes; @@ -6698,13 +6741,18 @@ at the end of the function. }]; } -def ArmSmeNewZADocs : Documentation { +def ArmNewDocs : Documentation { let Category = DocCatArmSmeAttributes; let Content = [{ -The ``__arm_new_za`` keyword applies to function declarations and specifies -that the function will be set up with a fresh ZA context. +The ``__arm_new`` keyword applies to function declarations and specifies +that the function will create a new scope for state S. + +The attribute takes string arguments to instruct the compiler for which state +to create new scope. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) -This means that: +For state ``"za"``, this means that: * the function requires that the target processor implements the Scalable Matrix Extension (SME). @@ -6715,8 +6763,8 @@ This means that: * the function will disable PSTATE.ZA (by setting it to 0) before returning. 
-For ``__arm_new_za`` functions Clang will set up the ZA context automatically -on entry to the function, and disable it before returning. For example, if ZA is +For ``__arm_new("za")`` functions Clang will set up the ZA context automatically +on entry to the function and disable it before returning. For example, if ZA is in a dormant state Clang will generate the code to commit a lazy-save and set up a new ZA state before executing user code. }]; diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h index 6396c0dc6ef0..99846f20482c 100644 --- a/clang/include/clang/Basic/AttributeCommonInfo.h +++ b/clang/include/clang/Basic/AttributeCommonInfo.h @@ -235,6 +235,19 @@ protected: return SpellingIndex != SpellingNotCalculated; } }; + +inline bool doesKeywordAttributeTakeArgs(tok::TokenKind Kind) { + switch (Kind) { + default: + return false; +#define KEYWORD_ATTRIBUTE(NAME, HASARG, ...) \ + case tok::kw_##NAME: \ + return HASARG; +#include "clang/Basic/RegularKeywordAttrInfo.inc" +#undef KEYWORD_ATTRIBUTE + } +} + } // namespace clang #endif // LLVM_CLANG_BASIC_ATTRIBUTECOMMONINFO_H diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index cd1fa11f8556..096a85af4abd 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -48,10 +48,10 @@ clang_tablegen(AttrSubMatchRulesList.inc -gen-clang-attr-subject-match-rule-list SOURCE Attr.td TARGET ClangAttrSubjectMatchRuleList) -clang_tablegen(AttrTokenKinds.inc -gen-clang-attr-token-kinds +clang_tablegen(RegularKeywordAttrInfo.inc -gen-clang-regular-keyword-attr-info -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE Attr.td - TARGET ClangAttrTokenKinds + TARGET ClangRegularKeywordAttrInfo ) clang_tablegen(AttrHasAttributeImpl.inc -gen-clang-attr-has-attribute-impl diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index d960701ba53d..e304ef6aec1c 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3641,6 +3641,12 @@ def err_sme_definition_using_sm_in_non_sme_target : Error< "function executed in streaming-SVE mode requires 'sme'">; def err_sme_definition_using_za_in_non_sme_target : Error< "function using ZA state requires 'sme'">; +def err_conflicting_attributes_arm_state : Error< + "conflicting attributes for state '%0'">; +def err_unknown_arm_state : Error< + "unknown state '%0'">; +def err_missing_arm_state : Error< + "missing state for %0">; def err_cconv_change : Error< "function declared '%0' here was previously declared " "%select{'%2'|without calling convention}1">; diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index afd101b007b4..a540a3ee1bf0 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -753,9 +753,9 @@ KEYWORD(__builtin_sycl_unique_stable_name, KEYSYCL) // Keywords defined by Attr.td. #ifndef KEYWORD_ATTRIBUTE -#define KEYWORD_ATTRIBUTE(X, EMPTY) KEYWORD(EMPTY ## X, KEYALL) +#define KEYWORD_ATTRIBUTE(X, HASARG, EMPTY) KEYWORD(EMPTY ## X, KEYALL) #endif -#include "clang/Basic/AttrTokenKinds.inc" +#include "clang/Basic/RegularKeywordAttrInfo.inc" // Clang-specific keywords enabled only in testing. 
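As a usage sketch (illustrative only, not part of the patch; assumes a target built with +sme), the keywords documented above combine like this:

  // Each keyword names the state it shares; "za" is the only supported state.
  void fill_za(const void *p) __arm_out("za");    // writes ZA, ignores input
  void read_za(void *p) __arm_in("za");           // reads ZA, returns it unchanged
  void step_za(void) __arm_inout("za");           // reads and writes ZA
  void no_za(void) __arm_preserves("za");         // leaves ZA untouched

  __arm_new("za") void run(const void *src) {     // fresh ZA scope, per the docs
    fill_za(src);                                 // OK: this caller has ZA state
    step_za();
  }                                               // PSTATE.ZA disabled on return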
TESTING_KEYWORD(__unknown_anytype , KEYALL) diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h index ff117bd5afc5..7529b922619a 100644 --- a/clang/include/clang/Basic/TokenKinds.h +++ b/clang/include/clang/Basic/TokenKinds.h @@ -110,7 +110,7 @@ bool isPragmaAnnotation(TokenKind K); inline constexpr bool isRegularKeywordAttribute(TokenKind K) { return (false #define KEYWORD_ATTRIBUTE(X, ...) || (K == tok::kw_##X) -#include "clang/Basic/AttrTokenKinds.inc" +#include "clang/Basic/RegularKeywordAttrInfo.inc" ); } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 2c6085b1b223..86cee3c728dc 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -7076,15 +7076,7 @@ public: NestedNameSpecInfo &IdInfo, bool EnteringContext); - /// The kind of conversion to check for. Either all attributes must match exactly, - /// or the converted type may add/drop '__arm_preserves_za'. - enum class AArch64SMECallConversionKind { - MatchExactly, - MayAddPreservesZA, - MayDropPreservesZA, - }; - bool IsInvalidSMECallConversion(QualType FromType, QualType ToType, - AArch64SMECallConversionKind C); + bool IsInvalidSMECallConversion(QualType FromType, QualType ToType); /// The parser has parsed a nested-name-specifier /// 'template[opt] template-name < template-args >::'. diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index eb69d0bb8755..31e6e56e7f69 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -937,15 +937,20 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T, OS << ')'; FunctionType::ExtInfo Info = T->getExtInfo(); + unsigned SMEBits = T->getAArch64SMEAttributes(); - if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask)) + if (SMEBits & FunctionType::SME_PStateSMCompatibleMask) OS << " __arm_streaming_compatible"; - if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask)) + if (SMEBits & FunctionType::SME_PStateSMEnabledMask) OS << " __arm_streaming"; - if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask)) - OS << " __arm_shared_za"; - if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZAPreservedMask)) - OS << " __arm_preserves_za"; + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves) + OS << " __arm_preserves(\"za\")"; + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In) + OS << " __arm_in(\"za\")"; + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out) + OS << " __arm_out(\"za\")"; + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut) + OS << " __arm_inout(\"za\")"; printFunctionAfter(Info, OS); @@ -1785,14 +1790,6 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, OS << "__arm_streaming_compatible"; return; } - if (T->getAttrKind() == attr::ArmSharedZA) { - OS << "__arm_shared_za"; - return; - } - if (T->getAttrKind() == attr::ArmPreservesZA) { - OS << "__arm_preserves_za"; - return; - } OS << " __attribute__(("; switch (T->getAttrKind()) { @@ -1836,8 +1833,10 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::WebAssemblyFuncref: case attr::ArmStreaming: case attr::ArmStreamingCompatible: - case attr::ArmSharedZA: - case attr::ArmPreservesZA: + case attr::ArmIn: + case attr::ArmOut: + case attr::ArmInOut: + case attr::ArmPreserves: llvm_unreachable("This attribute should have been handled already"); case attr::NSReturnsRetained: diff --git 
a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 87118ed823a7..e32feb378770 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10015,7 +10015,7 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)}); Value *Call = Builder.CreateCall(F, Ops); - return FormSVEBuiltinResult(Call); + return Call; } Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 265490ccfa9b..b6c7e7e576b9 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1763,14 +1763,21 @@ static void AddAttributesFromFunctionProtoType(ASTContext &Ctx, FPT->isNothrow()) FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); - if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + unsigned SMEBits = FPT->getAArch64SMEAttributes(); + if (SMEBits & FunctionType::SME_PStateSMEnabledMask) FuncAttrs.addAttribute("aarch64_pstate_sm_enabled"); - if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + if (SMEBits & FunctionType::SME_PStateSMCompatibleMask) FuncAttrs.addAttribute("aarch64_pstate_sm_compatible"); - if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) + + // ZA + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out || + FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut) + FuncAttrs.addAttribute("aarch64_pstate_za_shared"); + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves || + FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In) { FuncAttrs.addAttribute("aarch64_pstate_za_shared"); - if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZAPreservedMask) FuncAttrs.addAttribute("aarch64_pstate_za_preserved"); + } } static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs, @@ -2414,9 +2421,6 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, if (TargetDecl->hasAttr()) FuncAttrs.addAttribute("aarch64_pstate_sm_body"); - - if (TargetDecl->hasAttr()) - FuncAttrs.addAttribute("aarch64_pstate_za_new"); } // Attach "no-builtins" attributes to: diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 722d3dc2c63e..09b3a6b484e8 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2293,8 +2293,10 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, if (D->hasAttr()) B.addAttribute("aarch64_pstate_sm_body"); - if (D->hasAttr()) - B.addAttribute("aarch64_pstate_za_new"); + if (auto *Attr = D->getAttr()) { + if (Attr->isNewZA()) + B.addAttribute("aarch64_pstate_za_new"); + } // Track whether we need to add the optnone LLVM attribute, // starting with the default for this optimization level. diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 97cd51359b62..31a71be8a0f5 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -6786,7 +6786,13 @@ void Parser::ParseDirectDeclarator(Declarator &D) { } else if (Tok.isRegularKeywordAttribute()) { // For consistency with attribute parsing. 
Diag(Tok, diag::err_keyword_not_allowed) << Tok.getIdentifierInfo(); + bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind()); ConsumeToken(); + if (TakesArgs) { + BalancedDelimiterTracker T(*this, tok::l_paren); + if (!T.consumeOpen()) + T.skipToEnd(); + } } else if (Tok.is(tok::kw_requires) && D.hasGroupingParens()) { // This declarator is declaring a function, but the requires clause is // in the wrong place: diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index d9ff6c42c502..d5c09e943f6a 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -1887,7 +1887,13 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind, if (!SkipUntil(tok::r_paren, StopAtSemi)) break; } else if (Tok.isRegularKeywordAttribute()) { + bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind()); ConsumeToken(); + if (TakesArgs) { + BalancedDelimiterTracker T(*this, tok::l_paren); + if (!T.consumeOpen()) + T.skipToEnd(); + } } else { break; } @@ -4510,8 +4516,18 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs, if (Tok.isRegularKeywordAttribute()) { SourceLocation Loc = Tok.getLocation(); IdentifierInfo *AttrName = Tok.getIdentifierInfo(); - Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Tok.getKind()); + ParsedAttr::Form Form = ParsedAttr::Form(Tok.getKind()); + bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind()); ConsumeToken(); + if (TakesArgs) { + if (!Tok.is(tok::l_paren)) + Diag(Tok.getLocation(), diag::err_expected_lparen_after) << AttrName; + else + ParseAttributeArgsCommon(AttrName, Loc, Attrs, EndLoc, + /*ScopeName*/ nullptr, + /*ScopeLoc*/ Loc, Form); + } else + Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Form); return; } @@ -4677,11 +4693,13 @@ SourceLocation Parser::SkipCXX11Attributes() { T.consumeOpen(); T.skipToEnd(); EndLoc = T.getCloseLocation(); - } else if (Tok.isRegularKeywordAttribute()) { + } else if (Tok.isRegularKeywordAttribute() && + !doesKeywordAttributeTakeArgs(Tok.getKind())) { EndLoc = Tok.getLocation(); ConsumeToken(); } else { - assert(Tok.is(tok::kw_alignas) && "not an attribute specifier"); + assert((Tok.is(tok::kw_alignas) || Tok.isRegularKeywordAttribute()) && + "not an attribute specifier"); ConsumeToken(); BalancedDelimiterTracker T(*this, tok::l_paren); if (!T.consumeOpen()) diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index 664337052500..449be3425411 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -883,7 +883,8 @@ bool Parser::TrySkipAttributes() { // Note that explicitly checking for `[[` and `]]` allows to fail as // expected in the case of the Objective-C message send syntax. 
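A sketch of the source shapes this argument-aware skipping has to handle (illustrative; the typedef form matches the test files later in the patch): the parser must now consume ("za") as a balanced unit wherever the keyword appears, including positions where the attribute is only skipped during recovery or tentative parsing.

  typedef void (*za_callback)(int, int) __arm_inout("za");
  void invoke(za_callback f, int x, int y) __arm_inout("za");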
ConsumeBracket(); - } else if (Tok.isRegularKeywordAttribute()) { + } else if (Tok.isRegularKeywordAttribute() && + !doesKeywordAttributeTakeArgs(Tok.getKind())) { ConsumeToken(); } else { ConsumeToken(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 51113bac126d..c21395515f7c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3053,11 +3053,15 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, } static bool hasSMEZAState(const FunctionDecl *FD) { - if (FD->hasAttr<ArmNewZAAttr>()) - return true; - if (const auto *T = FD->getType()->getAs<FunctionProtoType>()) - if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) + if (auto *Attr = FD->getAttr<ArmNewAttr>()) + if (Attr->isNewZA()) + return true; + if (const auto *T = FD->getType()->getAs<FunctionProtoType>()) { + FunctionType::ArmStateValue State = + FunctionType::getArmZAState(T->getAArch64SMEAttributes()); + if (State != FunctionType::ARM_None) return true; + } return false; } @@ -7273,14 +7277,19 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, // If the callee uses AArch64 SME ZA state but the caller doesn't define // any, then this is an error. - if (ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateZASharedMask) { + FunctionType::ArmStateValue ArmZAState = + FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes); + if (ArmZAState != FunctionType::ARM_None) { bool CallerHasZAState = false; if (const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) { - if (CallerFD->hasAttr<ArmNewZAAttr>()) + auto *Attr = CallerFD->getAttr<ArmNewAttr>(); + if (Attr && Attr->isNewZA()) CallerHasZAState = true; - else if (const auto *FPT = CallerFD->getType()->getAs<FunctionProtoType>()) - CallerHasZAState = FPT->getExtProtoInfo().AArch64SMEAttributes & - FunctionType::SME_PStateZASharedMask; + else if (const auto *FPT = + CallerFD->getType()->getAs<FunctionProtoType>()) + CallerHasZAState = FunctionType::getArmZAState( + FPT->getExtProtoInfo().AArch64SMEAttributes) != + FunctionType::ARM_None; } if (!CallerHasZAState) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 9a4153feb10b..f7503e1e8e02 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3760,8 +3760,7 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S, // It is not permitted to redeclare an SME function with different SME // attributes. - if (IsInvalidSMECallConversion(Old->getType(), New->getType(), - AArch64SMECallConversionKind::MatchExactly)) { + if (IsInvalidSMECallConversion(Old->getType(), New->getType())) { Diag(New->getLocation(), diag::err_sme_attr_mismatch) << New->getType() << Old->getType(); Diag(OldLocation, diag::note_previous_declaration); @@ -12095,13 +12094,15 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, // Check if the function definition uses any AArch64 SME features without // having the '+sme' feature enabled.
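A sketch of what the caller-side ZA check in SemaChecking.cpp above accepts and rejects (illustrative):

  int uses_za(void) __arm_inout("za");

  int plain_caller(void) {
    return uses_za();        // error: caller defines no ZA state
  }

  __arm_new("za") int za_caller(void) {
    return uses_za();        // OK: __arm_new("za") gives the caller ZA state
  }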
if (DeclIsDefn) { + const auto *Attr = NewFD->getAttr<ArmNewAttr>(); bool UsesSM = NewFD->hasAttr<ArmLocallyStreamingAttr>(); - bool UsesZA = NewFD->hasAttr<ArmNewZAAttr>(); + bool UsesZA = Attr && Attr->isNewZA(); if (const auto *FPT = NewFD->getType()->getAs<FunctionProtoType>()) { FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); UsesSM |= EPI.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask; - UsesZA |= EPI.AArch64SMEAttributes & FunctionType::SME_PStateZASharedMask; + UsesZA |= FunctionType::getArmZAState(EPI.AArch64SMEAttributes) != + FunctionType::ARM_None; } if (UsesSM || UsesZA) { diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 72d6944dce93..87c89299ace6 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8714,26 +8714,74 @@ static bool MustDelayAttributeArguments(const ParsedAttr &AL) { return false; } +static bool checkArmNewAttrMutualExclusion( + Sema &S, const ParsedAttr &AL, const FunctionProtoType *FPT, + FunctionType::ArmStateValue CurrentState, StringRef StateName) { + auto CheckForIncompatibleAttr = + [&](FunctionType::ArmStateValue IncompatibleState, + StringRef IncompatibleStateName) { + if (CurrentState == IncompatibleState) { + S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) + << (std::string("'__arm_new(\"") + StateName.str() + "\")'") + << (std::string("'") + IncompatibleStateName.str() + "(\"" + + StateName.str() + "\")'") + << true; + AL.setInvalid(); + } + }; -static void handleArmNewZaAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - if (auto *FPT = dyn_cast<FunctionProtoType>(D->getFunctionType())) { - if (FPT->getAArch64SMEAttributes() & - FunctionType::SME_PStateZASharedMask) { - S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) - << AL << "'__arm_shared_za'" << true; + CheckForIncompatibleAttr(FunctionType::ARM_In, "__arm_in"); + CheckForIncompatibleAttr(FunctionType::ARM_Out, "__arm_out"); + CheckForIncompatibleAttr(FunctionType::ARM_InOut, "__arm_inout"); + CheckForIncompatibleAttr(FunctionType::ARM_Preserves, "__arm_preserves"); + return AL.isInvalid(); +} + +static void handleArmNewAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + if (!AL.getNumArgs()) { + S.Diag(AL.getLoc(), diag::err_missing_arm_state) << AL; + AL.setInvalid(); + return; + } + + std::vector<StringRef> NewState; + if (const auto *ExistingAttr = D->getAttr<ArmNewAttr>()) { + for (StringRef S : ExistingAttr->newArgs()) + NewState.push_back(S); + } + + bool HasZA = false; + for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) { + StringRef StateName; + SourceLocation LiteralLoc; + if (!S.checkStringLiteralArgumentAttr(AL, I, StateName, &LiteralLoc)) + return; + + if (StateName == "za") + HasZA = true; + else { + S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName; AL.setInvalid(); + return; } - if (FPT->getAArch64SMEAttributes() & - FunctionType::SME_PStateZAPreservedMask) { - S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible) - << AL << "'__arm_preserves_za'" << true; - AL.setInvalid(); + + if (std::find(NewState.begin(), NewState.end(), StateName) == + NewState.end()) { // Avoid adding duplicates.
+ NewState.push_back(StateName); } - if (AL.isInvalid()) + } + + if (auto *FPT = dyn_cast<FunctionProtoType>(D->getFunctionType())) { + FunctionType::ArmStateValue ZAState = + FunctionType::getArmZAState(FPT->getAArch64SMEAttributes()); + if (HasZA && ZAState != FunctionType::ARM_None && + checkArmNewAttrMutualExclusion(S, AL, FPT, ZAState, "za")) return; } - handleSimpleAttribute<ArmNewZAAttr>(S, D, AL); + D->dropAttr<ArmNewAttr>(); + D->addAttr(::new (S.Context) + ArmNewAttr(S.Context, AL, NewState.data(), NewState.size())); } /// ProcessDeclAttribute - Apply the specific attribute to the specified decl if @@ -9495,8 +9543,8 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, handleSimpleAttribute<ArmLocallyStreamingAttr>(S, D, AL); break; - case ParsedAttr::AT_ArmNewZA: - handleArmNewZaAttr(S, D, AL); + case ParsedAttr::AT_ArmNew: + handleArmNewAttr(S, D, AL); break; case ParsedAttr::AT_AcquireHandle: diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index d42588932629..a978ee0da6a0 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -17986,9 +17986,7 @@ bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New, } // SME attributes must match when overriding a function declaration. - if (IsInvalidSMECallConversion( - Old->getType(), New->getType(), - AArch64SMECallConversionKind::MayAddPreservesZA)) { + if (IsInvalidSMECallConversion(Old->getType(), New->getType())) { Diag(New->getLocation(), diag::err_conflicting_overriding_attributes) << New << New->getType() << Old->getType(); Diag(Old->getLocation(), diag::note_overridden_virtual_function); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e82392069fa7..8804076f4d11 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -9680,8 +9680,7 @@ ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc, } // Check that the SME attributes for PSTATE.ZA and PSTATE.SM are compatible. -bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType, - AArch64SMECallConversionKind C) { +bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType) { unsigned FromAttributes = 0, ToAttributes = 0; if (const auto *FromFn = dyn_cast<FunctionType>(Context.getCanonicalType(FromType))) FromAttributes = FromFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask; @@ -9692,25 +9691,7 @@ bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType, ToAttributes = ToFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask; - if (FromAttributes == ToAttributes) - return false; - - // If the '__arm_preserves_za' is the only difference between the types, - // check whether we're allowed to add or remove it.
- if ((FromAttributes ^ ToAttributes) == - FunctionType::SME_PStateZAPreservedMask) { - switch (C) { - case AArch64SMECallConversionKind::MatchExactly: - return true; - case AArch64SMECallConversionKind::MayAddPreservesZA: - return !(ToAttributes & FunctionType::SME_PStateZAPreservedMask); - case AArch64SMECallConversionKind::MayDropPreservesZA: - return !(FromAttributes & FunctionType::SME_PStateZAPreservedMask); - } - } - - // There has been a mismatch of attributes - return true; + return FromAttributes != ToAttributes; } // Check if we have a conversion between incompatible cmse function pointer @@ -9879,9 +9860,7 @@ checkPointerTypesForAssignment(Sema &S, QualType LHSType, QualType RHSType, return Sema::IncompatibleFunctionPointer; if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans)) return Sema::IncompatibleFunctionPointer; - if (S.IsInvalidSMECallConversion( - rtrans, ltrans, - Sema::AArch64SMECallConversionKind::MayDropPreservesZA)) + if (S.IsInvalidSMECallConversion(rtrans, ltrans)) return Sema::IncompatibleFunctionPointer; return ConvTy; } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 37cec76ec116..aef8dc58a48d 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1682,26 +1682,6 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType, Changed = true; } - // Drop the 'arm_preserves_za' if not present in the target type (we can do - // that because it is merely a hint). - if (const auto *FromFPT = dyn_cast(FromFn)) { - FunctionProtoType::ExtProtoInfo ExtInfo = FromFPT->getExtProtoInfo(); - if (ExtInfo.AArch64SMEAttributes & - FunctionType::SME_PStateZAPreservedMask) { - unsigned ToFlags = 0; - if (const auto *ToFPT = dyn_cast(ToFn)) - ToFlags = ToFPT->getExtProtoInfo().AArch64SMEAttributes; - if (!(ToFlags & FunctionType::SME_PStateZAPreservedMask)) { - ExtInfo.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask, - false); - QualType QT = Context.getFunctionType( - FromFPT->getReturnType(), FromFPT->getParamTypes(), ExtInfo); - FromFn = QT->getAs(); - Changed = true; - } - } - } - // Drop 'noexcept' if not present in target type. 
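With the conversion-kind machinery removed, SME attributes must now match exactly on both sides of a function-pointer conversion; a sketch of the behavior change (illustrative, using the typedef style of the tests below):

  typedef void (*pz_fn)(void) __arm_preserves("za");
  void pz(void) __arm_preserves("za");
  pz_fn fp1 = pz;            // OK: attributes match exactly
  void (*fp2)(void) = pz;    // now diagnosed as an incompatible function
                             // pointer: the hint can no longer be dropped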
if (const auto *FromFPT = dyn_cast(FromFn)) { const auto *ToFPT = cast(ToFn); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 2ccad19e066d..61e2ada139c7 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -144,8 +144,10 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_CmseNSCall: \ case ParsedAttr::AT_ArmStreaming: \ case ParsedAttr::AT_ArmStreamingCompatible: \ - case ParsedAttr::AT_ArmSharedZA: \ - case ParsedAttr::AT_ArmPreservesZA: \ + case ParsedAttr::AT_ArmPreserves: \ + case ParsedAttr::AT_ArmIn: \ + case ParsedAttr::AT_ArmOut: \ + case ParsedAttr::AT_ArmInOut: \ case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: \ case ParsedAttr::AT_AnyX86NoCfCheck: \ CALLING_CONV_ATTRS_CASELIST @@ -7795,6 +7797,49 @@ static bool checkMutualExclusion(TypeProcessingState &state, return true; } +static bool handleArmStateAttribute(Sema &S, + FunctionProtoType::ExtProtoInfo &EPI, + ParsedAttr &Attr, + FunctionType::ArmStateValue State) { + if (!Attr.getNumArgs()) { + S.Diag(Attr.getLoc(), diag::err_missing_arm_state) << Attr; + Attr.setInvalid(); + return true; + } + + for (unsigned I = 0; I < Attr.getNumArgs(); ++I) { + StringRef StateName; + SourceLocation LiteralLoc; + if (!S.checkStringLiteralArgumentAttr(Attr, I, StateName, &LiteralLoc)) + return true; + + unsigned Shift; + FunctionType::ArmStateValue ExistingState; + if (StateName == "za") { + Shift = FunctionType::SME_ZAShift; + ExistingState = FunctionType::getArmZAState(EPI.AArch64SMEAttributes); + } else { + S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName; + Attr.setInvalid(); + return true; + } + + // __arm_in(S), __arm_out(S), __arm_inout(S) and __arm_preserves(S) + // are all mutually exclusive for the same S, so check if there are + // conflicting attributes. + if (ExistingState != FunctionType::ARM_None && ExistingState != State) { + S.Diag(LiteralLoc, diag::err_conflicting_attributes_arm_state) + << StateName; + Attr.setInvalid(); + return true; + } + + EPI.setArmSMEAttribute( + (FunctionType::AArch64SMETypeAttributes)((State << Shift))); + } + return false; +} + /// Process an individual function attribute. Returns true to /// indicate that the attribute was handled, false if it wasn't. 
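handleArmStateAttribute above enforces the mutual-exclusion rule stated in the documentation; a sketch of the diagnosed combinations (illustrative):

  void f(void) __arm_in("za") __arm_out("za");   // error: conflicting
                                                 // attributes for state 'za'
  void g(void) __arm_in("za") __arm_in("za");    // OK: same state value repeated
  void h(void) __arm_in();                       // error: missing state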
static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, @@ -7926,11 +7971,18 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, if (attr.getKind() == ParsedAttr::AT_ArmStreaming || attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible || - attr.getKind() == ParsedAttr::AT_ArmSharedZA || - attr.getKind() == ParsedAttr::AT_ArmPreservesZA){ - if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr)) + attr.getKind() == ParsedAttr::AT_ArmPreserves || + attr.getKind() == ParsedAttr::AT_ArmIn || + attr.getKind() == ParsedAttr::AT_ArmOut || + attr.getKind() == ParsedAttr::AT_ArmInOut) { + if (S.CheckAttrTarget(attr)) return true; + if (attr.getKind() == ParsedAttr::AT_ArmStreaming || + attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible) + if (S.CheckAttrNoArgs(attr)) + return true; + if (!unwrapped.isFunctionType()) return false; @@ -7957,11 +8009,21 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, return true; EPI.setArmSMEAttribute(FunctionType::SME_PStateSMCompatibleMask); break; - case ParsedAttr::AT_ArmSharedZA: - EPI.setArmSMEAttribute(FunctionType::SME_PStateZASharedMask); + case ParsedAttr::AT_ArmPreserves: + if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_Preserves)) + return true; + break; + case ParsedAttr::AT_ArmIn: + if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_In)) + return true; break; - case ParsedAttr::AT_ArmPreservesZA: - EPI.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask); + case ParsedAttr::AT_ArmOut: + if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_Out)) + return true; + break; + case ParsedAttr::AT_ArmInOut: + if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_InOut)) + return true; break; default: llvm_unreachable("Unsupported attribute"); diff --git a/clang/test/AST/ast-dump-sme-attributes.cpp b/clang/test/AST/ast-dump-sme-attributes.cpp index 6581fd4baba9..133648d90a15 100644 --- a/clang/test/AST/ast-dump-sme-attributes.cpp +++ b/clang/test/AST/ast-dump-sme-attributes.cpp @@ -13,16 +13,16 @@ struct Foo { // CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming_compatible 'void () __arm_streaming_compatible' // CHECK-NEXT: |-CXXMethodDecl {{.*}} f_locally_streaming 'void ()' // CHECK-NEXT: | `-ArmLocallyStreamingAttr -// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_shared_za 'void () __arm_shared_za' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_shared_za 'void () __arm_inout("za")' // CHECK-NEXT: |-CXXMethodDecl {{.*}} f_new_za 'void ()' -// CHECK-NEXT: | `-ArmNewZAAttr -// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_preserves_za 'void () __arm_preserves_za' +// CHECK-NEXT: | `-ArmNewAttr {{.*}} za +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_preserves_za 'void () __arm_preserves("za")' void f_streaming() __arm_streaming; void f_streaming_compatible() __arm_streaming_compatible; __arm_locally_streaming void f_locally_streaming(); - void f_shared_za() __arm_shared_za; - __arm_new_za void f_new_za(); - void f_preserves_za() __arm_preserves_za; + void f_shared_za() __arm_inout("za"); + __arm_new("za") void f_new_za(); + void f_preserves_za() __arm_preserves("za"); // CHECK: |-CXXMethodDecl {{.*}} test_lambda 'int (int)' implicit-inline diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp index 0768bfc33238..f69703a8a7d8 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp +++ 
b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -12,8 +12,8 @@ extern int normal_callee(); int streaming_decl(void) __arm_streaming; int streaming_compatible_decl(void) __arm_streaming_compatible; -int shared_za_decl(void) __arm_shared_za; -int preserves_za_decl(void) __arm_preserves_za; +int shared_za_decl(void) __arm_inout("za"); +int preserves_za_decl(void) __arm_preserves("za"); int private_za_decl(void); // == FUNCTION DEFINITIONS == @@ -78,7 +78,7 @@ __arm_locally_streaming int locally_streaming_callee() { // CHECK-SAME: #[[ZA_SHARED:[0-9]+]] // CHECK: call i32 @normal_callee() // - int shared_za_caller() __arm_shared_za { + int shared_za_caller() __arm_inout("za") { return normal_callee(); } @@ -86,7 +86,7 @@ __arm_locally_streaming int locally_streaming_callee() { // CHECK-SAME: #[[ZA_SHARED]] // CHECK: call i32 @shared_za_decl() #[[ZA_SHARED_CALL:[0-9]+]] // - int shared_za_callee() __arm_shared_za { + int shared_za_callee() __arm_inout("za") { return shared_za_decl(); } @@ -97,7 +97,7 @@ __arm_locally_streaming int locally_streaming_callee() { // CHECK-SAME: #[[ZA_PRESERVED:[0-9]+]] // CHECK: call i32 @normal_callee() // - int preserves_za_caller() __arm_preserves_za { + int preserves_za_caller() __arm_preserves("za") { return normal_callee(); } @@ -105,7 +105,7 @@ __arm_locally_streaming int locally_streaming_callee() { // CHECK-SAME: #[[ZA_PRESERVED]] // CHECK: call i32 @preserves_za_decl() #[[ZA_PRESERVED_CALL:[0-9]+]] // - int preserves_za_callee() __arm_preserves_za { + int preserves_za_callee() __arm_preserves("za") { return preserves_za_decl(); } @@ -116,7 +116,7 @@ __arm_locally_streaming int locally_streaming_callee() { // CHECK-SAME: #[[ZA_NEW:[0-9]+]] // CHECK: call i32 @normal_callee() // -__arm_new_za int new_za_caller() { +__arm_new("za") int new_za_caller() { return normal_callee(); } @@ -124,7 +124,7 @@ __arm_new_za int new_za_caller() { // CHECK-SAME: #[[ZA_NEW]] // CHECK: call i32 @private_za_decl() // -__arm_new_za int new_za_callee() { +__arm_new("za") int new_za_callee() { return private_za_decl(); } @@ -135,8 +135,8 @@ __arm_new_za int new_za_callee() { // and also to callsites. typedef void (*s_ptrty) (int, int) __arm_streaming; typedef void (*sc_ptrty) (int, int) __arm_streaming_compatible; -typedef void (*sz_ptrty) (int, int) __arm_shared_za; -typedef void (*pz_ptrty) (int, int) __arm_preserves_za; +typedef void (*sz_ptrty) (int, int) __arm_inout("za"); +typedef void (*pz_ptrty) (int, int) __arm_preserves("za"); // CHECK-LABEL: @test_streaming_ptrty( // CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] @@ -152,12 +152,12 @@ void test_streaming_compatible_ptrty(sc_ptrty f, int x, int y) { return f(x, y); // CHECK-SAME: #[[ZA_SHARED]] // CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_SHARED_CALL]] // -void test_shared_za(sz_ptrty f, int x, int y) __arm_shared_za { return f(x, y); } +void test_shared_za(sz_ptrty f, int x, int y) __arm_inout("za") { return f(x, y); } // CHECK-LABEL: @test_preserved_za( // CHECK-SAME: #[[ZA_SHARED]] // CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_PRESERVED_CALL]] // -void test_preserved_za(pz_ptrty f, int x, int y) __arm_shared_za { return f(x, y); } +void test_preserved_za(pz_ptrty f, int x, int y) __arm_inout("za") { return f(x, y); } // CHECK-LABEL: @test_indirect_streaming_ptrty( // CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] @@ -255,7 +255,7 @@ int call() { return 0; } template __attribute__((always_inline)) -int call(T f, Other... 
other) __arm_shared_za { +int call(T f, Other... other) __arm_inout("za") { return f() + call(other...); } @@ -270,7 +270,7 @@ int call(T f, Other... other) __arm_shared_za { // CHECK-NEXT: add nsw // CHECK-NEXT: add nsw // CHECK-NEXT: ret -int test_variadic_template() __arm_shared_za { +int test_variadic_template() __arm_inout("za") { return call(normal_callee, streaming_decl, streaming_compatible_decl, @@ -286,18 +286,18 @@ int test_variadic_template() __arm_shared_za { // CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } // CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } // CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_pstate_za_shared" } -// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" } +// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" } // CHECK: attributes #[[NOUNWIND_CALL]] = { nounwind } // CHECK: attributes #[[NOUNWIND_SM_ENABLED_CALL]] = { nounwind "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[NOUNWIND_SM_COMPATIBLE_CALL]] = { nounwind "aarch64_pstate_sm_compatible" } // CHECK: attributes #[[NOUNWIND_ZA_SHARED_CALL]] = { nounwind "aarch64_pstate_za_shared" } -// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_pstate_za_preserved" } +// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index 6cf7ebd2c16b..08301009df47 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -21,7 +21,7 @@ // 
CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn); } @@ -33,7 +33,7 @@ void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn); } @@ -45,7 +45,7 @@ void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn); } @@ -57,7 +57,7 @@ void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn); } @@ -69,7 +69,7 @@ void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_strea // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn); } @@ -81,7 +81,7 @@ void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn); } @@ -93,7 +93,7 @@ void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn); } @@ -105,6 +105,6 @@ void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // 
CHECK-NEXT: ret void // -void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _s32, _m)(3, pn, pm, zn); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c index 1bab5a3f7597..b8836bec6400 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -21,7 +21,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn); } @@ -33,7 +33,7 @@ void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn); } @@ -45,7 +45,7 @@ void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn); } @@ -57,7 +57,7 @@ void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn); } @@ -69,7 +69,7 @@ void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_strea // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn); } @@ -81,7 +81,7 @@ void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn); } @@ -93,7 +93,7 @@ void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, 
svuint64_t zn) __arm_stre // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn); } @@ -105,6 +105,6 @@ void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za { +void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _s64, _m)(7, pn, pm, zn); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index e12c5bbc9895..e3c727941ccc 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -13,7 +13,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_hor_za8(0, slice_base, pg, ptr); svld1_hor_za8(0, slice_base + 15, pg, ptr); } @@ -27,7 +27,7 @@ void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_hor_za16(0, slice_base, pg, ptr); svld1_hor_za16(1, slice_base + 7, pg, ptr); } @@ -41,7 +41,7 @@ void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_hor_za32(0, slice_base, pg, ptr); svld1_hor_za32(3, slice_base + 3, pg, ptr); } @@ -55,7 +55,7 @@ void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_hor_za64(0, slice_base, pg, ptr); svld1_hor_za64(7, slice_base + 1, pg, ptr); } @@ -68,7 +68,7 @@ void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret 
void // -void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_hor_za128(0, slice_base, pg, ptr); svld1_hor_za128(15, slice_base, pg, ptr); } @@ -81,7 +81,7 @@ void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __a // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_ver_za8(0, slice_base, pg, ptr); svld1_ver_za8(0, slice_base + 15, pg, ptr); } @@ -95,7 +95,7 @@ void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_ver_za16(0, slice_base, pg, ptr); svld1_ver_za16(1, slice_base + 7, pg, ptr); } @@ -109,7 +109,7 @@ void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_ver_za32(0, slice_base, pg, ptr); svld1_ver_za32(3, slice_base + 3, pg, ptr); } @@ -123,7 +123,7 @@ void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_ver_za64(0, slice_base, pg, ptr); svld1_ver_za64(7, slice_base + 1, pg, ptr); } @@ -136,7 +136,7 @@ void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za { +void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { svld1_ver_za128(0, slice_base, pg, ptr); svld1_ver_za128(15, slice_base, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 09b1acb19108..34191bf799f3 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -16,7 +16,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -void 
test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -33,7 +33,7 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -50,7 +50,7 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -67,7 +67,7 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -83,7 +83,7 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
@@ -99,7 +99,7 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -116,7 +116,7 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -133,7 +133,7 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -150,7 +150,7 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -166,7 +166,7 @@ void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index e19f8f927e98..5f5b40a5ccf9 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -11,7 +11,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, 0);
 }

@@ -21,7 +21,7 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, 15);
 }

@@ -31,7 +31,7 @@ void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_shared_za
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT: ret void
 //
-void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_za(slice_base, ptr);
 }

@@ -42,7 +42,7 @@ void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_shared_za {
+void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, vnum);
 }

@@ -52,6 +52,6 @@ void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, 16);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index e762faa6ca4e..08f945001d07 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }

@@ -29,7 +29,7 @@ void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }

@@ -41,7 +41,7 @@ void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }

@@ -53,7 +53,7 @@ void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm);
 }

@@
-65,7 +65,7 @@ void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { +void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm); } @@ -75,7 +75,7 @@ void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { +void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm); } @@ -85,6 +85,6 @@ void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za { +void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svusmopa_za32, _u8, _m)(0, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c index e9789b96b1b0..42f09516a74f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -21,7 +21,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za { +void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm); } @@ -33,7 +33,7 @@ void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { +void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm); } @@ -45,7 +45,7 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { +void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm); } @@ -57,7 +57,7 @@ void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, 
svfloat64_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { +void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm); } @@ -69,6 +69,6 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za { +void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c index 9256b8fadc7b..5629c59d66d6 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c @@ -19,7 +19,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm); } @@ -29,7 +29,7 @@ void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a // CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm); } @@ -41,7 +41,7 @@ void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm); } @@ -53,7 +53,7 @@ void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm); } @@ -65,7 +65,7 @@ void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t // CHECK-NEXT: tail call void 
@llvm.aarch64.sme.mops.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm); } @@ -75,7 +75,7 @@ void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { +void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm); } @@ -85,6 +85,6 @@ void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za { +void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svusmops_za32, _u8, _m)(0, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c index d2e4d2a5c09e..c1abc3a30079 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c @@ -21,7 +21,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm); } @@ -33,7 +33,7 @@ void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm); } @@ -45,7 +45,7 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { +void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm); } @@ -57,7 +57,7 @@ void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) 
// CHECK-NEXT: ret void // -void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { +void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm); } @@ -69,6 +69,6 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za { +void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c index efc1f536c2e7..6748f8602206 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -19,7 +19,7 @@ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base); } @@ -30,7 +30,7 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 15; return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice); } @@ -42,7 +42,7 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base); } @@ -54,7 +54,7 @@ svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice); } @@ -66,7 +66,7 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base); } @@ -78,7 +78,7 @@ svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice); } @@ -90,7 +90,7 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base); } @@ -102,7 +102,7 @@ svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice); } @@ -113,7 +113,7 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base); } @@ -124,7 +124,7 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 15; return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice); } @@ -136,7 +136,7 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], 
[[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base); } @@ -148,7 +148,7 @@ svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice); } @@ -160,7 +160,7 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base); } @@ -172,7 +172,7 @@ svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice); } @@ -184,7 +184,7 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base); } @@ -196,7 +196,7 @@ svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice); } @@ -208,7 +208,7 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, 
i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base); } @@ -220,7 +220,7 @@ svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice); } @@ -232,7 +232,7 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base); } @@ -244,7 +244,7 @@ svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice); } @@ -256,7 +256,7 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base); } @@ -268,7 +268,7 @@ svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice); } @@ -280,7 +280,7 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( 
[[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base); } @@ -292,7 +292,7 @@ svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice); } @@ -303,7 +303,7 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base); } @@ -313,7 +313,7 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base); } @@ -324,7 +324,7 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base); } @@ -335,7 +335,7 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base); } @@ -346,7 +346,7 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // 
CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base); } @@ -357,7 +357,7 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base); } @@ -368,7 +368,7 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base); } @@ -379,7 +379,7 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base); } @@ -389,7 +389,7 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base); } @@ -399,7 +399,7 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base); } @@ -410,7 +410,7 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za128_u16(svuint16_t zd, 
svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base); } @@ -421,7 +421,7 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base); } @@ -432,7 +432,7 @@ svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base); } @@ -443,7 +443,7 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base); } @@ -454,7 +454,7 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base); } @@ -465,7 +465,7 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base); } @@ -476,7 +476,7 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming 
__arm_shared_za { +svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base); } @@ -487,7 +487,7 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base); } @@ -498,7 +498,7 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base); } @@ -509,7 +509,7 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base); } @@ -520,7 +520,7 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base); } @@ -531,7 +531,7 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base); } @@ -542,7 +542,7 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za 
{ +svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base); } @@ -553,7 +553,7 @@ svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base); } @@ -563,7 +563,7 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base); } @@ -574,7 +574,7 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 15; return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice); } @@ -586,7 +586,7 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base); } @@ -598,7 +598,7 @@ svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice); } @@ -610,7 +610,7 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, 
uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base); } @@ -622,7 +622,7 @@ svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice); } @@ -634,7 +634,7 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base); } @@ -646,7 +646,7 @@ svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice); } @@ -657,7 +657,7 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base); } @@ -668,7 +668,7 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 15; return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice); } @@ -680,7 +680,7 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming 
__arm_in("za") { return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base); } @@ -692,7 +692,7 @@ svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 7; return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice); } @@ -704,7 +704,7 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base); } @@ -716,7 +716,7 @@ svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 3; return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice); } @@ -728,7 +728,7 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base); } @@ -740,7 +740,7 @@ svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { uint32_t slice = slice_base + 1; return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice); } @@ -752,7 +752,7 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za { +svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { 
   return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -764,7 +764,7 @@ svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -776,7 +776,7 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -788,7 +788,7 @@ svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   uint32_t slice = slice_base + 7;
   return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -800,7 +800,7 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -812,7 +812,7 @@ svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   uint32_t slice = slice_base + 3;
   return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -824,7 +824,7 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -836,7 +836,7 @@ svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   uint32_t slice = slice_base + 1;
   return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -847,7 +847,7 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -857,7 +857,7 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -868,7 +868,7 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -879,7 +879,7 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -890,7 +890,7 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -901,7 +901,7 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -912,7 +912,7 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -923,7 +923,7 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -933,7 +933,7 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -943,7 +943,7 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -954,7 +954,7 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -965,7 +965,7 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -976,7 +976,7 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -987,7 +987,7 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -998,7 +998,7 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1009,7 +1009,7 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1020,7 +1020,7 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1031,7 +1031,7 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1042,7 +1042,7 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1053,7 +1053,7 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1064,7 +1064,7 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1075,7 +1075,7 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1086,7 +1086,7 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1097,6 +1097,6 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]])
 // CHECK-NEXT: ret [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
   return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
index 052eceffccda..b418f21c5cf8 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
@@ -13,7 +13,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za8(0, slice_base, pg, ptr);
   svst1_hor_za8(0, slice_base + 15, pg, ptr);
 }
@@ -27,7 +27,7 @@ void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za16(0, slice_base, pg, ptr);
   svst1_hor_za16(1, slice_base + 7, pg, ptr);
 }
@@ -41,7 +41,7 @@ void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za32(0, slice_base, pg, ptr);
   svst1_hor_za32(3, slice_base + 3, pg, ptr);
 }
@@ -55,7 +55,7 @@ void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za64(0, slice_base, pg, ptr);
   svst1_hor_za64(7, slice_base + 1, pg, ptr);
 }
@@ -68,7 +68,7 @@ void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za128(0, slice_base, pg, ptr);
   svst1_hor_za128(15, slice_base, pg, ptr);
 }
@@ -81,7 +81,7 @@ void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_str
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za8(0, slice_base, pg, ptr);
   svst1_ver_za8(0, slice_base + 15, pg, ptr);
 }
@@ -95,7 +95,7 @@ void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za16(0, slice_base, pg, ptr);
   svst1_ver_za16(1, slice_base + 7, pg, ptr);
 }
@@ -109,7 +109,7 @@ void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za32(0, slice_base, pg, ptr);
   svst1_ver_za32(3, slice_base + 3, pg, ptr);
 }
@@ -123,7 +123,7 @@ void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za64(0, slice_base, pg, ptr);
   svst1_ver_za64(7, slice_base + 1, pg, ptr);
 }
@@ -136,7 +136,7 @@ void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za128(0, slice_base, pg, ptr);
   svst1_ver_za128(15, slice_base, pg, ptr);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
index 3892050d8855..d346ec346e61 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -16,7 +16,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -33,7 +33,7 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -50,7 +50,7 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -67,7 +67,7 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -83,7 +83,7 @@ void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
@@ -99,7 +99,7 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -116,7 +116,7 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -133,7 +133,7 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -150,7 +150,7 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -166,7 +166,7 @@ void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT: ret void
 //
-void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
index 282819c8ca35..c3e4967bfe9b 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
@@ -66,7 +66,7 @@ bool test_has_sme(void) __arm_streaming_compatible {
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: ret void
 //
-void test_svundef_za(void) __arm_streaming_compatible __arm_shared_za {
+void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") {
   svundef_za();
 }
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index 5256b63907ae..23dcd9c12a8d 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -11,7 +11,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT: ret void
 //
-void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, 0);
 }
 
@@ -21,7 +21,7 @@ void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_shared_za {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT: ret void
 //
-void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, 15);
 }
 
@@ -31,7 +31,7 @@ void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_shared_za {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT: ret void
 //
-void test_svstr_za(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_za(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_za(slice_base, ptr);
 }
 
@@ -42,7 +42,7 @@ void test_svstr_za(uint32_t slice_base, void *ptr) __arm_shared_za {
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_shared_za {
+void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, vnum);
 }
 
@@ -52,6 +52,6 @@ void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
 // CHECK-NEXT: ret void
 //
-void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, 16);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
index 7a2dafcce66d..f4ac53291400 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
@@ -19,7 +19,7 @@
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -30,7 +30,7 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -42,7 +42,7 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -54,7 +54,7 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -66,7 +66,7 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -78,7 +78,7 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -90,7 +90,7 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -102,7 +102,7 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -113,7 +113,7 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -124,7 +124,7 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -136,7 +136,7 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -148,7 +148,7 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -160,7 +160,7 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -172,7 +172,7 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -184,7 +184,7 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -196,7 +196,7 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -208,7 +208,7 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -220,7 +220,7 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -232,7 +232,7 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -244,7 +244,7 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -256,7 +256,7 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -268,7 +268,7 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -280,7 +280,7 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -292,7 +292,7 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -303,7 +303,7 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -313,7 +313,7 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
@@ -324,7 +324,7 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -335,7 +335,7 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
@@ -346,7 +346,7 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -357,7 +357,7 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
@@ -368,7 +368,7 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -379,7 +379,7 @@ void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
@@ -389,7 +389,7 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -399,7 +399,7 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
@@ -410,7 +410,7 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -421,7 +421,7 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
@@ -432,7 +432,7 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -443,7 +443,7 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
@@ -454,7 +454,7 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -465,7 +465,7 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
@@ -476,7 +476,7 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -487,7 +487,7 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
@@ -498,7 +498,7 @@ void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -509,7 +509,7 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
@@ -520,7 +520,7 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -531,7 +531,7 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
@@ -542,7 +542,7 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -553,7 +553,7 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 
@@ -563,7 +563,7 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -574,7 +574,7 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -586,7 +586,7 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -598,7 +598,7 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -610,7 +610,7 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -622,7 +622,7 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -634,7 +634,7 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -646,7 +646,7 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -657,7 +657,7 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -668,7 +668,7 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -680,7 +680,7 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -692,7 +692,7 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -704,7 +704,7 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -716,7 +716,7 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -728,7 +728,7 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -740,7 +740,7 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -752,7 +752,7 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -764,7 +764,7 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -776,7 +776,7 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -788,7 +788,7 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -800,7 +800,7 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -812,7 +812,7 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -824,7 +824,7 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -836,7 +836,7 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -847,7 +847,7 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -857,7 +857,7 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT: ret void
 //
-void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128,
_s8, _m)(15, slice_base, pg, zn); } @@ -868,7 +868,7 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn); } @@ -879,7 +879,7 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn); } @@ -890,7 +890,7 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn); } @@ -901,7 +901,7 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn); } @@ -912,7 +912,7 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn); } @@ -923,7 +923,7 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn); } @@ -933,7 +933,7 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], 
[[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn); } @@ -943,7 +943,7 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn); } @@ -954,7 +954,7 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn); } @@ -965,7 +965,7 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn); } @@ -976,7 +976,7 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn); } @@ -987,7 +987,7 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn); } @@ -998,7 +998,7 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t 
pg, svuint64_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn); } @@ -1009,7 +1009,7 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn); } @@ -1020,7 +1020,7 @@ void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn); } @@ -1031,7 +1031,7 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn); } @@ -1042,7 +1042,7 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn); } @@ -1053,7 +1053,7 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn); } @@ -1064,7 +1064,7 @@ void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn); } @@ -1075,7 +1075,7 @@ void test_svwrite_ver_za128_f32(uint32_t 
slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn); } @@ -1086,7 +1086,7 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn); } @@ -1097,6 +1097,6 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za { +void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index 9aebe9d42cbf..0c157af1cdc9 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -11,7 +11,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 0) // CHECK-NEXT: ret void // -void test_svzero_mask_za(void) __arm_shared_za { +void test_svzero_mask_za(void) __arm_inout("za") { svzero_mask_za(0); } @@ -21,7 +21,7 @@ void test_svzero_mask_za(void) __arm_shared_za { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 176) // CHECK-NEXT: ret void // -void test_svzero_mask_za_1(void) __arm_shared_za { +void test_svzero_mask_za_1(void) __arm_inout("za") { svzero_mask_za(176); } @@ -31,7 +31,7 @@ void test_svzero_mask_za_1(void) __arm_shared_za { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) // CHECK-NEXT: ret void // -void test_svzero_mask_za_2(void) __arm_shared_za { +void test_svzero_mask_za_2(void) __arm_inout("za") { svzero_mask_za(255); } @@ -41,6 +41,6 @@ void test_svzero_mask_za_2(void) __arm_shared_za { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) // CHECK-NEXT: ret void // -void test_svzero_za(void) __arm_shared_za { +void test_svzero_za(void) __arm_out("za") { svzero_za(); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c index dd96dca70d63..4249e9c6933a 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c @@ -36,7 +36,7 @@ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t 
zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm); } @@ -54,7 +54,7 @@ void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm); } @@ -72,7 +72,7 @@ void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm); } @@ -90,7 +90,7 @@ void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm); } @@ -114,7 +114,7 @@ void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm); } @@ -136,7 +136,7 @@ void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm); } @@ -158,7 +158,7 @@ void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void 
test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm); } @@ -180,7 +180,7 @@ void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm); } @@ -208,7 +208,7 @@ void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm); } @@ -230,7 +230,7 @@ void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm); } @@ -252,7 +252,7 @@ void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm); } @@ -274,7 +274,7 @@ void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm); } @@ -306,7 +306,7 @@ void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], 
[[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm); } @@ -336,7 +336,7 @@ void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm); } @@ -366,7 +366,7 @@ void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm); } @@ -396,7 +396,7 @@ void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za { +void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm); } @@ -420,7 +420,7 @@ void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, zn); } @@ -438,7 +438,7 @@ void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base , zn); } @@ -456,7 +456,7 @@ void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void 
test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, zn); } @@ -474,7 +474,7 @@ void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_strea // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, zn); } @@ -492,7 +492,7 @@ void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, zn); } @@ -510,7 +510,7 @@ void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, zn); } @@ -534,7 +534,7 @@ void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_strea // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, zn); } @@ -556,7 +556,7 @@ void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, zn); } @@ -578,7 +578,7 @@ void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, zn); } @@ -600,7 +600,7 @@ void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_strea // 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, zn); } @@ -622,7 +622,7 @@ void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, zn); } @@ -644,6 +644,6 @@ void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za { +void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, zn); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c index 9570deab0b39..3f0a36db313b 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c @@ -36,7 +36,7 @@ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm); } @@ -54,7 +54,7 @@ void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm); } @@ -72,7 +72,7 @@ void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm); } @@ -90,7 +90,7 @@ void 
test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm); } @@ -114,7 +114,7 @@ void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm); } @@ -136,7 +136,7 @@ void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm); } @@ -158,7 +158,7 @@ void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm); } @@ -180,7 +180,7 @@ void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm); } @@ -208,7 +208,7 @@ void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_inout("za") { 
SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm); } @@ -230,7 +230,7 @@ void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm); } @@ -252,7 +252,7 @@ void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm); } @@ -274,7 +274,7 @@ void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm); } @@ -306,7 +306,7 @@ void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm); } @@ -336,7 +336,7 @@ void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm); } @@ -366,7 +366,7 @@ void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za { +void 
test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm); } @@ -396,7 +396,7 @@ void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za { +void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm); } @@ -420,7 +420,7 @@ void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, zn); } @@ -438,7 +438,7 @@ void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x2)(slice_base , zn); } @@ -456,7 +456,7 @@ void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, zn); } @@ -474,7 +474,7 @@ void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_strea // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, zn); } @@ -492,7 +492,7 @@ void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x2)(slice_base, zn); } @@ -510,7 +510,7 @@ void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, zn); } @@ -534,7 +534,7 @@ void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_strea // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, zn); } @@ -556,7 +556,7 @@ void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x4)(slice_base, zn); } @@ -578,7 +578,7 @@ void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, zn); } @@ -600,7 +600,7 @@ void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_strea // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, zn); } @@ -622,7 +622,7 @@ void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_stre // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x4)(slice_base, zn); } @@ -644,6 +644,6 @@ void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_stream // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: ret void // -void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za { +void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming 
__arm_inout("za") { SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, zn); } diff --git a/clang/test/Modules/aarch64-sme-keywords.cppm b/clang/test/Modules/aarch64-sme-keywords.cppm index 6784aaa01d21..df4dd32b16cf 100644 --- a/clang/test/Modules/aarch64-sme-keywords.cppm +++ b/clang/test/Modules/aarch64-sme-keywords.cppm @@ -13,8 +13,8 @@ export module A; export void f_streaming(void) __arm_streaming { } export void f_streaming_compatible(void) __arm_streaming_compatible { } -export void f_shared_za(void) __arm_shared_za { } -export void f_preserves_za(void) __arm_preserves_za { } +export void f_shared_za(void) __arm_inout("za") { } +export void f_preserves_za(void) __arm_preserves("za") { } //--- Use.cpp // expected-no-diagnostics @@ -50,11 +50,11 @@ import A; // CHECK-DAG: attributes #[[STREAMING_DECL]] = {{{.*}} "aarch64_pstate_sm_enabled" {{.*}}} // CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_DECL]] = {{{.*}} "aarch64_pstate_sm_compatible" {{.*}}} // CHECK-DAG: attributes #[[SHARED_ZA_USE]] = { "aarch64_pstate_za_shared" } -// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_pstate_za_preserved" } +// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" } // CHECK-DAG: attributes #[[STREAMING_USE]] = { "aarch64_pstate_sm_enabled" } // CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_USE]] = { "aarch64_pstate_sm_compatible" } -void f_shared_za_caller(void) __arm_shared_za { +void f_shared_za_caller(void) __arm_inout("za") { f_shared_za(); f_preserves_za(); } diff --git a/clang/test/Parser/c2x-attribute-keywords.c b/clang/test/Parser/c2x-attribute-keywords.c index d8291b710e6d..b88d2b9c23e6 100644 --- a/clang/test/Parser/c2x-attribute-keywords.c +++ b/clang/test/Parser/c2x-attribute-keywords.c @@ -1,60 +1,64 @@ -// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %s -// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %s - -enum __arm_streaming E { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - One __arm_streaming, // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +// RUN: sed -e "s@ATTR_USE@__arm_streaming@g" -e "s@ATTR_NAME@__arm_streaming@g" %s > %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %t +// RUN: sed -e "s@ATTR_USE@__arm_inout\(\"za\"\)@g" -e "s@ATTR_NAME@__arm_inout@g" %s > %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %t + +enum ATTR_USE E { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + One ATTR_USE, // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} Two, - Three __arm_streaming // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} + Three ATTR_USE // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} }; -enum __arm_streaming { Four }; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -__arm_streaming enum E2 { Five }; // expected-error {{misplaced '__arm_streaming'}} +enum ATTR_USE { Four }; // 
expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +ATTR_USE enum E2 { Five }; // expected-error {{misplaced 'ATTR_NAME'}} // FIXME: this diagnostic can be improved. -enum { __arm_streaming Six }; // expected-error {{expected identifier}} +enum { ATTR_USE Six }; // expected-error {{expected identifier}} // FIXME: this diagnostic can be improved. -enum E3 __arm_streaming { Seven }; // expected-error {{expected identifier or '('}} - -struct __arm_streaming S1 { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - int i __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} - int __arm_streaming j; // expected-error {{'__arm_streaming' only applies to function types}} - int k[10] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} - int l __arm_streaming[10]; // expected-error {{'__arm_streaming' only applies to function types}} - __arm_streaming int m, n; // expected-error {{'__arm_streaming' only applies to function types}} - int o __arm_streaming : 12; // expected-error {{'__arm_streaming' only applies to function types}} - int __arm_streaming : 0; // expected-error {{'__arm_streaming' only applies to function types}} - int p, __arm_streaming : 0; // expected-error {{'__arm_streaming' cannot appear here}} - int q, __arm_streaming r; // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming int; // expected-error {{'__arm_streaming' cannot appear here}} \ +enum E3 ATTR_USE { Seven }; // expected-error {{expected identifier or '('}} + +struct ATTR_USE S1 { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + int i ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} + int ATTR_USE j; // expected-error {{'ATTR_NAME' only applies to function types}} + int k[10] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} + int l ATTR_USE[10]; // expected-error {{'ATTR_NAME' only applies to function types}} + ATTR_USE int m, n; // expected-error {{'ATTR_NAME' only applies to function types}} + int o ATTR_USE : 12; // expected-error {{'ATTR_NAME' only applies to function types}} + int ATTR_USE : 0; // expected-error {{'ATTR_NAME' only applies to function types}} + int p, ATTR_USE : 0; // expected-error {{'ATTR_NAME' cannot appear here}} + int q, ATTR_USE r; // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE int; // expected-error {{'ATTR_NAME' cannot appear here}} \ // expected-warning {{declaration does not declare anything}} }; -__arm_streaming struct S2 { int a; }; // expected-error {{misplaced '__arm_streaming'}} -struct S3 __arm_streaming { int a; }; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +ATTR_USE struct S2 { int a; }; // expected-error {{misplaced 'ATTR_NAME'}} +struct S3 ATTR_USE { int a; }; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} -union __arm_streaming U { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - double d __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types; type here is 'double'}} - __arm_streaming int i; // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}} +union ATTR_USE U { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + double d ATTR_USE; // 
expected-error {{'ATTR_NAME' only applies to function types; type here is 'double'}} + ATTR_USE int i; // expected-error {{'ATTR_NAME' only applies to function types; type here is 'int'}} }; -__arm_streaming union U2 { double d; }; // expected-error {{misplaced '__arm_streaming'}} -union U3 __arm_streaming { double d; }; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +ATTR_USE union U2 { double d; }; // expected-error {{misplaced 'ATTR_NAME'}} +union U3 ATTR_USE { double d; }; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} -struct __arm_streaming IncompleteStruct; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -union __arm_streaming IncompleteUnion; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -enum __arm_streaming IncompleteEnum; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +struct ATTR_USE IncompleteStruct; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +union ATTR_USE IncompleteUnion; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE IncompleteEnum; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} -__arm_streaming void f1(void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -void __arm_streaming f2(void); // expected-error {{'__arm_streaming' only applies to function types}} -void f3 __arm_streaming (void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -void f4(void) __arm_streaming; +ATTR_USE void f1(void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +void ATTR_USE f2(void); // expected-error {{'ATTR_NAME' only applies to function types}} +void f3 ATTR_USE (void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +void f4(void) ATTR_USE; -void f5(int i __arm_streaming, __arm_streaming int j, int __arm_streaming k); // expected-error 3 {{'__arm_streaming' only applies to function types}} +void f5(int i ATTR_USE, ATTR_USE int j, int ATTR_USE k); // expected-error 3 {{'ATTR_NAME' only applies to function types}} -void f6(a, b) __arm_streaming int a; int b; { // expected-error {{'__arm_streaming' cannot appear here}} \ +void f6(a, b) ATTR_USE int a; int b; { // expected-error {{'ATTR_NAME' cannot appear here}} \ c2x-warning {{deprecated}} } @@ -63,57 +67,74 @@ void f6(a, b) __arm_streaming int a; int b; { // expected-error {{'__arm_streami // behavior given that we *don't* want to parse it as part of the K&R parameter // declarations. It is disallowed to avoid a parsing ambiguity we already // handle well. 
-int (*f7(a, b))(int, int) __arm_streaming int a; int b; { // c2x-warning {{deprecated}} +int (*f7(a, b))(int, int) ATTR_USE int a; int b; { // c2x-warning {{deprecated}} return 0; } -__arm_streaming int a, b; // expected-error {{'__arm_streaming' only applies to function types}} -int c __arm_streaming, d __arm_streaming; // expected-error 2 {{'__arm_streaming' only applies to function types}} +ATTR_USE int a, b; // expected-error {{'ATTR_NAME' only applies to function types}} +int c ATTR_USE, d ATTR_USE; // expected-error 2 {{'ATTR_NAME' only applies to function types}} -void f8(void) __arm_streaming { - __arm_streaming int i, j; // expected-error {{'__arm_streaming' only applies to function types}} - int k, l __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} +void f8(void) ATTR_USE { + ATTR_USE int i, j; // expected-error {{'ATTR_NAME' only applies to function types}} + int k, l ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} } -__arm_streaming void f9(void) { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - int i[10] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} - int (*fp1)(void)__arm_streaming; - int (*fp2 __arm_streaming)(void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +ATTR_USE void f9(void) { // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + int i[10] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} + int (*fp1)(void)ATTR_USE; + int (*fp2 ATTR_USE)(void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} - int * __arm_streaming *ipp; // expected-error {{'__arm_streaming' only applies to function types}} + int * ATTR_USE *ipp; // expected-error {{'ATTR_NAME' only applies to function types}} } -void f10(int j[static 10] __arm_streaming, int k[*] __arm_streaming); // expected-error 2 {{'__arm_streaming' only applies to function types}} +void f10(int j[static 10] ATTR_USE, int k[*] ATTR_USE); // expected-error 2 {{'ATTR_NAME' only applies to function types}} void f11(void) { - __arm_streaming {} // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming if (1) {} // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE {} // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE if (1) {} // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming switch (1) { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming case 1: __arm_streaming break; // expected-error 2 {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming default: break; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE switch (1) { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE case 1: ATTR_USE break; // expected-error 2 {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE default: break; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } goto foo; - __arm_streaming foo: (void)1; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} + ATTR_USE foo: (void)1; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} - __arm_streaming for (;;); // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming while (1); // expected-error {{'__arm_streaming' cannot be applied to a 
statement}} - __arm_streaming do __arm_streaming { } while(1); // expected-error 2 {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE for (;;); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE while (1); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE do ATTR_USE { } while(1); // expected-error 2 {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming (void)1; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE (void)1; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - (void)sizeof(int [4]__arm_streaming); // expected-error {{'__arm_streaming' only applies to function types}} - (void)sizeof(struct __arm_streaming S3 { int a __arm_streaming; }); // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} \ - // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}} + (void)sizeof(int [4]ATTR_USE); // expected-error {{'ATTR_NAME' only applies to function types}} + (void)sizeof(struct ATTR_USE S3 { int a ATTR_USE; }); // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} \ + // expected-error {{'ATTR_NAME' only applies to function types; type here is 'int'}} - __arm_streaming return; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE return; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming asm (""); // expected-error {{'__arm_streaming' cannot appear here}} + ATTR_USE asm (""); // expected-error {{'ATTR_NAME' cannot appear here}} } -struct __arm_streaming S4 *s; // expected-error {{'__arm_streaming' cannot appear here}} +struct ATTR_USE S4 *s; // expected-error {{'ATTR_NAME' cannot appear here}} struct S5 {}; -int c = sizeof(struct __arm_streaming S5); // expected-error {{'__arm_streaming' cannot appear here}} +int c = sizeof(struct ATTR_USE S5); // expected-error {{'ATTR_NAME' cannot appear here}} + +void invalid_parentheses1() __arm_inout; // expected-error {{expected '(' after ''__arm_inout''}} +void invalid_parentheses2() __arm_inout(; // expected-error {{expected string literal as argument of '__arm_inout' attribute}} +void invalid_parentheses3() __arm_inout((); // expected-error {{expected string literal as argument of '__arm_inout' attribute}} +void invalid_parentheses4() __arm_inout); // expected-error {{expected '(' after ''__arm_inout''}} \ + // expected-error {{expected function body after function declarator}} +void invalid_parentheses5() __arm_inout(()); // expected-error {{expected string literal as argument of '__arm_inout' attribute}} +void invalid_parentheses6() __arm_inout("za"; // expected-error {{expected ')'}} +void invalid_parentheses7() __arm_streaming(; // expected-error {{expected parameter declarator}} \ + // expected-error {{expected ')'}} \ + // expected-note {{to match this '('}} \ + // expected-error {{function cannot return function type 'void ()'}} \ + // expected-error {{'__arm_streaming' only applies to function types; type here is 'int ()'}} \ + // expected-warning {{'__arm_streaming' only applies to non-K&R-style functions}} +void invalid_parentheses8() __arm_streaming(); // expected-error {{function cannot return function type 'void ()'}} \ + // expected-error {{'__arm_streaming' only applies to function types; type 
here is 'int ()'}} \ + // expected-warning {{'__arm_streaming' only applies to non-K&R-style functions}} diff --git a/clang/test/Parser/c2x-attribute-keywords.m b/clang/test/Parser/c2x-attribute-keywords.m index 2296be13cb71..575c88ffffc3 100644 --- a/clang/test/Parser/c2x-attribute-keywords.m +++ b/clang/test/Parser/c2x-attribute-keywords.m @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify %s -enum __arm_streaming E1 : int; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +enum __arm_inout("za") E1 : int; // expected-error {{'__arm_inout' only applies to non-K&R-style functions}} @interface Base @end @@ -15,5 +15,5 @@ enum __arm_streaming E1 : int; // expected-error {{'__arm_streaming' only applie void f(T *t) { - __arm_streaming[[t foo] bar]; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + __arm_inout("za")[[t foo] bar]; // expected-error {{'__arm_inout' cannot be applied to a statement}} } diff --git a/clang/test/Parser/cxx0x-keyword-attributes.cpp b/clang/test/Parser/cxx0x-keyword-attributes.cpp index 8d31efac5320..be7423cc7ece 100644 --- a/clang/test/Parser/cxx0x-keyword-attributes.cpp +++ b/clang/test/Parser/cxx0x-keyword-attributes.cpp @@ -1,4 +1,7 @@ -// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme %s +// RUN: sed -e "s@ATTR_USE@__arm_streaming@g" -e "s@ATTR_NAME@__arm_streaming@g" %s > %t +// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme -x c++ %t +// RUN: sed -e "s@ATTR_USE@__arm_inout\(\"za\"\)@g" -e "s@ATTR_NAME@__arm_inout@g" %s > %t +// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme -x c++ %t // Need std::initializer_list namespace std { @@ -35,136 +38,136 @@ namespace std { // Declaration syntax checks -__arm_streaming int before_attr; // expected-error {{'__arm_streaming' only applies to function types}} -int __arm_streaming between_attr; // expected-error {{'__arm_streaming' only applies to function types}} -const __arm_streaming int between_attr_2 = 0; // expected-error {{'__arm_streaming' cannot appear here}} -int after_attr __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} -int * __arm_streaming ptr_attr; // expected-error {{'__arm_streaming' only applies to function types}} -int & __arm_streaming ref_attr = after_attr; // expected-error {{'__arm_streaming' only applies to function types}} -int && __arm_streaming rref_attr = 0; // expected-error {{'__arm_streaming' only applies to function types}} -int array_attr [1] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} -void fn_attr () __arm_streaming; -void noexcept_fn_attr () noexcept __arm_streaming; +ATTR_USE int before_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +int ATTR_USE between_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +const ATTR_USE int between_attr_2 = 0; // expected-error {{'ATTR_NAME' cannot appear here}} +int after_attr ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} +int * ATTR_USE ptr_attr; // 
expected-error {{'ATTR_NAME' only applies to function types}} +int & ATTR_USE ref_attr = after_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +int && ATTR_USE rref_attr = 0; // expected-error {{'ATTR_NAME' only applies to function types}} +int array_attr [1] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} +void fn_attr () ATTR_USE; +void noexcept_fn_attr () noexcept ATTR_USE; struct MemberFnOrder { - virtual void f() const volatile && noexcept __arm_streaming final = 0; + virtual void f() const volatile && noexcept ATTR_USE final = 0; }; -struct __arm_streaming struct_attr; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -class __arm_streaming class_attr {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -union __arm_streaming union_attr; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -enum __arm_streaming E { }; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +struct ATTR_USE struct_attr; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +class ATTR_USE class_attr {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +union ATTR_USE union_attr; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E { }; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} namespace test_misplacement { -__arm_streaming struct struct_attr2; // expected-error {{misplaced '__arm_streaming'}} -__arm_streaming class class_attr2; // expected-error {{misplaced '__arm_streaming'}} -__arm_streaming union union_attr2; // expected-error {{misplaced '__arm_streaming'}} -__arm_streaming enum E2 { }; // expected-error {{misplaced '__arm_streaming'}} +ATTR_USE struct struct_attr2; // expected-error {{misplaced 'ATTR_NAME'}} +ATTR_USE class class_attr2; // expected-error {{misplaced 'ATTR_NAME'}} +ATTR_USE union union_attr2; // expected-error {{misplaced 'ATTR_NAME'}} +ATTR_USE enum E2 { }; // expected-error {{misplaced 'ATTR_NAME'}} } // Checks attributes placed at wrong syntactic locations of class specifiers. 
-class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} - attr_after_class_name_decl __arm_streaming __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} +class ATTR_USE ATTR_USE // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + attr_after_class_name_decl ATTR_USE ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} -class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} - attr_after_class_name_definition __arm_streaming __arm_streaming __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 3 {{'__arm_streaming' only applies to non-K&R-style functions}} +class ATTR_USE ATTR_USE // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + attr_after_class_name_definition ATTR_USE ATTR_USE ATTR_USE{}; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error 3 {{'ATTR_NAME' only applies to non-K&R-style functions}} -class __arm_streaming c {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -class c __arm_streaming __arm_streaming x; // expected-error 2 {{'__arm_streaming' only applies to function types}} -class c __arm_streaming __arm_streaming y __arm_streaming __arm_streaming; // expected-error 4 {{'__arm_streaming' only applies to function types}} +class ATTR_USE c {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +class c ATTR_USE ATTR_USE x; // expected-error 2 {{'ATTR_NAME' only applies to function types}} +class c ATTR_USE ATTR_USE y ATTR_USE ATTR_USE; // expected-error 4 {{'ATTR_NAME' only applies to function types}} class c final [(int){0}]; class base {}; -class __arm_streaming __arm_streaming final_class // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} - __arm_streaming alignas(float) final // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - __arm_streaming alignas(float) __arm_streaming alignas(float): base{}; // expected-error {{'__arm_streaming' cannot appear here}} +class ATTR_USE ATTR_USE final_class // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE alignas(float) final // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE alignas(float) ATTR_USE alignas(float): base{}; // expected-error {{'ATTR_NAME' cannot appear here}} -class __arm_streaming __arm_streaming final_class_another // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} - __arm_streaming __arm_streaming alignas(16) final // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} - __arm_streaming __arm_streaming alignas(16) __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}} +class ATTR_USE ATTR_USE final_class_another // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE ATTR_USE alignas(16) final // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + 
ATTR_USE ATTR_USE alignas(16) ATTR_USE{}; // expected-error {{'ATTR_NAME' cannot appear here}} -class after_class_close {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "class" to apply it to the type declaration}} +class after_class_close {} ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here, place it after "class" to apply it to the type declaration}} class C {}; -__arm_streaming struct with_init_declarators {} init_declarator; // expected-error {{'__arm_streaming' only applies to function types}} -__arm_streaming struct no_init_declarators; // expected-error {{misplaced '__arm_streaming'}} -template __arm_streaming struct no_init_declarators_template; // expected-error {{'__arm_streaming' cannot appear here}} +ATTR_USE struct with_init_declarators {} init_declarator; // expected-error {{'ATTR_NAME' only applies to function types}} +ATTR_USE struct no_init_declarators; // expected-error {{misplaced 'ATTR_NAME'}} +template ATTR_USE struct no_init_declarators_template; // expected-error {{'ATTR_NAME' cannot appear here}} void fn_with_structs() { - __arm_streaming struct with_init_declarators {} init_declarator; // expected-error {{'__arm_streaming' only applies to function types}} - __arm_streaming struct no_init_declarators; // expected-error {{'__arm_streaming' cannot appear here}} + ATTR_USE struct with_init_declarators {} init_declarator; // expected-error {{'ATTR_NAME' only applies to function types}} + ATTR_USE struct no_init_declarators; // expected-error {{'ATTR_NAME' cannot appear here}} } -__arm_streaming; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +ATTR_USE; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} struct ctordtor { - __arm_streaming ctordtor __arm_streaming () __arm_streaming; // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} - ctordtor (C) __arm_streaming; - __arm_streaming ~ctordtor __arm_streaming () __arm_streaming; // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} + ATTR_USE ctordtor ATTR_USE () ATTR_USE; // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} + ctordtor (C) ATTR_USE; + ATTR_USE ~ctordtor ATTR_USE () ATTR_USE; // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} }; -__arm_streaming ctordtor::ctordtor __arm_streaming () __arm_streaming {} // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming ctordtor::ctordtor (C) __arm_streaming try {} catch (...) 
{} // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming ctordtor::~ctordtor __arm_streaming () __arm_streaming {} // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} -extern "C++" __arm_streaming int extern_attr; // expected-error {{'__arm_streaming' only applies to function types}} -template __arm_streaming void template_attr (); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming __arm_streaming int __arm_streaming __arm_streaming multi_attr __arm_streaming __arm_streaming; // expected-error 6 {{'__arm_streaming' only applies to function types}} - -int (paren_attr) __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here}} -unsigned __arm_streaming int attr_in_decl_spec; // expected-error {{'__arm_streaming' cannot appear here}} -unsigned __arm_streaming int __arm_streaming const double_decl_spec = 0; // expected-error 2 {{'__arm_streaming' cannot appear here}} +ATTR_USE ctordtor::ctordtor ATTR_USE () ATTR_USE {} // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} +ATTR_USE ctordtor::ctordtor (C) ATTR_USE try {} catch (...) {} // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +ATTR_USE ctordtor::~ctordtor ATTR_USE () ATTR_USE {} // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} +extern "C++" ATTR_USE int extern_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +template ATTR_USE void template_attr (); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +ATTR_USE ATTR_USE int ATTR_USE ATTR_USE multi_attr ATTR_USE ATTR_USE; // expected-error 6 {{'ATTR_NAME' only applies to function types}} + +int (paren_attr) ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here}} +unsigned ATTR_USE int attr_in_decl_spec; // expected-error {{'ATTR_NAME' cannot appear here}} +unsigned ATTR_USE int ATTR_USE const double_decl_spec = 0; // expected-error 2 {{'ATTR_NAME' cannot appear here}} class foo { - void const_after_attr () __arm_streaming const; // expected-error {{expected ';'}} + void const_after_attr () ATTR_USE const; // expected-error {{expected ';'}} }; -extern "C++" __arm_streaming { } // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming extern "C++" { } // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming template void before_template_attr (); // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming namespace ns { int i; } // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming static_assert(true, ""); //expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming asm(""); // expected-error {{'__arm_streaming' cannot appear here}} - -__arm_streaming using ns::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -__arm_streaming using namespace ns; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -namespace __arm_streaming ns2 {} // expected-warning {{attributes on a namespace declaration are a C++17 extension}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - -using __arm_streaming alignas(4)__arm_streaming ns::i; // expected-warning 2 {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot appear here}} \ +extern "C++" ATTR_USE { } // expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE extern "C++" { } // expected-error {{'ATTR_NAME' cannot 
appear here}} +ATTR_USE template void before_template_attr (); // expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE namespace ns { int i; } // expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE static_assert(true, ""); //expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE asm(""); // expected-error {{'ATTR_NAME' cannot appear here}} + +ATTR_USE using ns::i; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +ATTR_USE using namespace ns; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +namespace ATTR_USE ns2 {} // expected-warning {{attributes on a namespace declaration are a C++17 extension}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + +using ATTR_USE alignas(4)ATTR_USE ns::i; // expected-warning 2 {{ISO C++}} \ + expected-error {{'ATTR_NAME' cannot appear here}} \ expected-error {{'alignas' attribute only applies to variables, data members and tag types}} \ expected-warning {{ISO C++}} \ - expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} -using __arm_streaming alignas(4) __arm_streaming foobar = int; // expected-error {{'__arm_streaming' cannot appear here}} \ + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} +using ATTR_USE alignas(4) ATTR_USE foobar = int; // expected-error {{'ATTR_NAME' cannot appear here}} \ expected-error {{'alignas' attribute only applies to}} \ - expected-error 2 {{'__arm_streaming' only applies to function types}} - -__arm_streaming using T = int; // expected-error {{'__arm_streaming' cannot appear here}} -using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}} -template using U __arm_streaming = T; // expected-error {{'__arm_streaming' only applies to function types}} -using ns::i __arm_streaming; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -using ns::i __arm_streaming, ns::i __arm_streaming; // expected-warning 2 {{ISO C++}} \ + expected-error 2 {{'ATTR_NAME' only applies to function types}} + +ATTR_USE using T = int; // expected-error {{'ATTR_NAME' cannot appear here}} +using T ATTR_USE = int; // expected-error {{'ATTR_NAME' only applies to function types}} +template using U ATTR_USE = T; // expected-error {{'ATTR_NAME' only applies to function types}} +using ns::i ATTR_USE; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +using ns::i ATTR_USE, ns::i ATTR_USE; // expected-warning 2 {{ISO C++}} \ expected-warning {{use of multiple declarators in a single using declaration is a C++17 extension}} \ - expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}} + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} struct using_in_struct_base { typedef int i, j, k, l; }; struct using_in_struct : using_in_struct_base { - __arm_streaming using using_in_struct_base::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - using using_in_struct_base::j __arm_streaming; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - __arm_streaming using using_in_struct_base::k __arm_streaming, using_in_struct_base::l __arm_streaming; // expected-warning 3 {{ISO C++}} \ + ATTR_USE using using_in_struct_base::i; // expected-warning {{ISO C++}} \ + 
expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + using using_in_struct_base::j ATTR_USE; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE using using_in_struct_base::k ATTR_USE, using_in_struct_base::l ATTR_USE; // expected-warning 3 {{ISO C++}} \ expected-warning {{use of multiple declarators in a single using declaration is a C++17 extension}} \ - expected-error 4 {{'__arm_streaming' only applies to non-K&R-style functions}} + expected-error 4 {{'ATTR_NAME' only applies to non-K&R-style functions}} }; -using __arm_streaming ns::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}} +using ATTR_USE ns::i; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +using T ATTR_USE = int; // expected-error {{'ATTR_NAME' only applies to function types}} -auto trailing() -> __arm_streaming const int; // expected-error {{'__arm_streaming' cannot appear here}} -auto trailing() -> const __arm_streaming int; // expected-error {{'__arm_streaming' cannot appear here}} -auto trailing() -> const int __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} -auto trailing_2() -> struct struct_attr __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} +auto trailing() -> ATTR_USE const int; // expected-error {{'ATTR_NAME' cannot appear here}} +auto trailing() -> const ATTR_USE int; // expected-error {{'ATTR_NAME' cannot appear here}} +auto trailing() -> const int ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} +auto trailing_2() -> struct struct_attr ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} namespace N { struct S {}; @@ -172,88 +175,88 @@ namespace N { template struct Template {}; // FIXME: Improve this diagnostic -struct __arm_streaming N::S s; // expected-error {{'__arm_streaming' cannot appear here}} -struct __arm_streaming Template t; // expected-error {{'__arm_streaming' cannot appear here}} -struct __arm_streaming ::template Template u; // expected-error {{'__arm_streaming' cannot appear here}} -template struct __arm_streaming Template; // expected-error {{'__arm_streaming' cannot appear here}} +struct ATTR_USE N::S s; // expected-error {{'ATTR_NAME' cannot appear here}} +struct ATTR_USE Template t; // expected-error {{'ATTR_NAME' cannot appear here}} +struct ATTR_USE ::template Template u; // expected-error {{'ATTR_NAME' cannot appear here}} +template struct ATTR_USE Template; // expected-error {{'ATTR_NAME' cannot appear here}} template struct __attribute__((pure)) Template; // We still allow GNU-style attributes here -template <> struct __arm_streaming Template; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - -enum __arm_streaming E1 {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -enum __arm_streaming E2; // expected-error {{forbids forward references}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -enum __arm_streaming E1; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -enum __arm_streaming E3 : int; // expected-error 
{{'__arm_streaming' only applies to non-K&R-style functions}} -enum __arm_streaming { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} - k_123 __arm_streaming = 123 // expected-warning {{attributes on an enumerator declaration are a C++17 extension}} \ - expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +template <> struct ATTR_USE Template; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + +enum ATTR_USE E1 {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E2; // expected-error {{forbids forward references}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E1; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E3 : int; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + k_123 ATTR_USE = 123 // expected-warning {{attributes on an enumerator declaration are a C++17 extension}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} }; -enum __arm_streaming E1 e; // expected-error {{'__arm_streaming' cannot appear here}} -enum __arm_streaming class E4 { }; // expected-error {{'__arm_streaming' cannot appear here}} -enum struct __arm_streaming E5; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} -enum E6 {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "enum" to apply it to the type declaration}} +enum ATTR_USE E1 e; // expected-error {{'ATTR_NAME' cannot appear here}} +enum ATTR_USE class E4 { }; // expected-error {{'ATTR_NAME' cannot appear here}} +enum struct ATTR_USE E5; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum E6 {} ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here, place it after "enum" to apply it to the type declaration}} struct S { - friend int f __arm_streaming (); // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - friend int f2 __arm_streaming () {} // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - __arm_streaming friend int g(); // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming friend int h() { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + friend int f ATTR_USE (); // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + friend int f2 ATTR_USE () {} // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + ATTR_USE friend int g(); // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE friend int h() { // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} } - __arm_streaming friend int f3(), f4(), f5(); // expected-error {{'__arm_streaming' cannot appear here}} - friend int f6 __arm_streaming (), f7 __arm_streaming (), f8 __arm_streaming (); // expected-error3 {{'__arm_streaming' cannot appear here}} \ - expected-error 3 {{'__arm_streaming' cannot be applied to a declaration}} - friend class __arm_streaming C; // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming friend class D; // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming friend int; // expected-error {{'__arm_streaming' cannot appear here}} + 
ATTR_USE friend int f3(), f4(), f5(); // expected-error {{'ATTR_NAME' cannot appear here}} + friend int f6 ATTR_USE (), f7 ATTR_USE (), f8 ATTR_USE (); // expected-error3 {{'ATTR_NAME' cannot appear here}} \ + expected-error 3 {{'ATTR_NAME' cannot be applied to a declaration}} + friend class ATTR_USE C; // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE friend class D; // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE friend int; // expected-error {{'ATTR_NAME' cannot appear here}} }; template void tmpl (T) {} -template __arm_streaming void tmpl(char); // expected-error {{'__arm_streaming' cannot appear here}} -template void __arm_streaming tmpl(short); // expected-error {{'__arm_streaming' only applies to function types}} +template ATTR_USE void tmpl(char); // expected-error {{'ATTR_NAME' cannot appear here}} +template void ATTR_USE tmpl(short); // expected-error {{'ATTR_NAME' only applies to function types}} // Statement tests void foo () { - __arm_streaming ; // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming { } // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming if (0) { } // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming for (;;); // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming do { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming continue; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE ; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE { } // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE if (0) { } // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE for (;;); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE do { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE continue; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } while (0); - __arm_streaming while (0); // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE while (0); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming switch (i) { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming case 0: // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming default: // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming break; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE switch (i) { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE case 0: // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE default: // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE break; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } - __arm_streaming goto there; // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming there: // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} + ATTR_USE goto there; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE there: // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} - __arm_streaming try { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - } __arm_streaming catch 
(...) { // expected-error {{'__arm_streaming' cannot appear here}} + ATTR_USE try { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + } ATTR_USE catch (...) { // expected-error {{'ATTR_NAME' cannot appear here}} } - void bar __arm_streaming (__arm_streaming int i, __arm_streaming int j); // expected-error 2 {{'__arm_streaming' only applies to function types}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - using FuncType = void (__arm_streaming int); // expected-error {{'__arm_streaming' only applies to function types}} - void baz(__arm_streaming...); // expected-error {{expected parameter declarator}} + void bar ATTR_USE (ATTR_USE int i, ATTR_USE int j); // expected-error 2 {{'ATTR_NAME' only applies to function types}} \ + expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + using FuncType = void (ATTR_USE int); // expected-error {{'ATTR_NAME' only applies to function types}} + void baz(ATTR_USE...); // expected-error {{expected parameter declarator}} - __arm_streaming return; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE return; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } // Expression tests void bar () { - new int[42]__arm_streaming[5]__arm_streaming{}; // expected-error {{'__arm_streaming' only applies to function types}} + new int[42]ATTR_USE[5]ATTR_USE{}; // expected-error {{'ATTR_NAME' only applies to function types}} } // Condition tests void baz () { - if (__arm_streaming bool b = true) { // expected-error {{'__arm_streaming' only applies to function types}} - switch (__arm_streaming int n { 42 }) { // expected-error {{'__arm_streaming' only applies to function types}} + if (ATTR_USE bool b = true) { // expected-error {{'ATTR_NAME' only applies to function types}} + switch (ATTR_USE int n { 42 }) { // expected-error {{'ATTR_NAME' only applies to function types}} default: - for (__arm_streaming int n = 0; __arm_streaming char b = n < 5; ++b) { // expected-error 2 {{'__arm_streaming' only applies to function types}} + for (ATTR_USE int n = 0; ATTR_USE char b = n < 5; ++b) { // expected-error 2 {{'ATTR_NAME' only applies to function types}} } } } @@ -261,37 +264,37 @@ void baz () { // An attribute can be applied to an expression-statement, such as the first // statement in a for. But it can't be applied to a condition which is an // expression. 
- for (__arm_streaming x = 0; ; ) {} // expected-error {{'__arm_streaming' cannot appear here}} - for (; __arm_streaming x < 5; ) {} // expected-error {{'__arm_streaming' cannot appear here}} - while (__arm_streaming bool k { false }) { // expected-error {{'__arm_streaming' only applies to function types}} + for (ATTR_USE x = 0; ; ) {} // expected-error {{'ATTR_NAME' cannot appear here}} + for (; ATTR_USE x < 5; ) {} // expected-error {{'ATTR_NAME' cannot appear here}} + while (ATTR_USE bool k { false }) { // expected-error {{'ATTR_NAME' only applies to function types}} } - while (__arm_streaming true) { // expected-error {{'__arm_streaming' cannot appear here}} + while (ATTR_USE true) { // expected-error {{'ATTR_NAME' cannot appear here}} } do { - } while (__arm_streaming false); // expected-error {{'__arm_streaming' cannot appear here}} + } while (ATTR_USE false); // expected-error {{'ATTR_NAME' cannot appear here}} - for (__arm_streaming int n : { 1, 2, 3 }) { // expected-error {{'__arm_streaming' only applies to function types}} + for (ATTR_USE int n : { 1, 2, 3 }) { // expected-error {{'ATTR_NAME' only applies to function types}} } } enum class __attribute__((visibility("hidden"))) SecretKeepers { one, /* rest are deprecated */ two, three }; -enum class __arm_streaming EvenMoreSecrets {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +enum class ATTR_USE EvenMoreSecrets {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} // Forbid attributes on decl specifiers. -unsigned __arm_streaming static int __arm_streaming v1; // expected-error {{'__arm_streaming' only applies to function types}} \ - expected-error {{'__arm_streaming' cannot appear here}} -typedef __arm_streaming unsigned long __arm_streaming v2; // expected-error {{'__arm_streaming' only applies to function types}} \ - expected-error {{'__arm_streaming' cannot appear here}} -int __arm_streaming foo(int __arm_streaming x); // expected-error 2 {{'__arm_streaming' only applies to function types}} +unsigned ATTR_USE static int ATTR_USE v1; // expected-error {{'ATTR_NAME' only applies to function types}} \ + expected-error {{'ATTR_NAME' cannot appear here}} +typedef ATTR_USE unsigned long ATTR_USE v2; // expected-error {{'ATTR_NAME' only applies to function types}} \ + expected-error {{'ATTR_NAME' cannot appear here}} +int ATTR_USE foo(int ATTR_USE x); // expected-error 2 {{'ATTR_NAME' only applies to function types}} -__arm_streaming; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} +ATTR_USE; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} class A { - A(__arm_streaming int a); // expected-error {{'__arm_streaming' only applies to function types}} + A(ATTR_USE int a); // expected-error {{'ATTR_NAME' only applies to function types}} }; -A::A(__arm_streaming int a) {} // expected-error {{'__arm_streaming' only applies to function types}} +A::A(ATTR_USE int a) {} // expected-error {{'ATTR_NAME' only applies to function types}} template struct TemplateStruct {}; class FriendClassesWithAttributes { @@ -299,47 +302,47 @@ class FriendClassesWithAttributes { template friend class __attribute__((__type_visibility__("default"))) vector; template friend class __declspec(code_seg("foo,whatever")) vector2; // But not C++11 ones - template friend class __arm_streaming vector3; // expected-error {{'__arm_streaming' cannot appear here}} + template friend class ATTR_USE vector3; // expected-error {{'ATTR_NAME' cannot appear here}} 
// Also allowed friend struct __attribute__((__type_visibility__("default"))) TemplateStruct; friend struct __declspec(code_seg("foo,whatever")) TemplateStruct; - friend struct __arm_streaming TemplateStruct; // expected-error {{'__arm_streaming' cannot appear here}} + friend struct ATTR_USE TemplateStruct; // expected-error {{'ATTR_NAME' cannot appear here}} }; // Check ordering: C++11 attributes must appear before GNU attributes. class Ordering { void f1( - int (__arm_streaming __attribute__(()) int n) // expected-error {{'__arm_streaming' only applies to function types}} + int (ATTR_USE __attribute__(()) int n) // expected-error {{'ATTR_NAME' only applies to function types}} ) { } void f2( - int (*)(__arm_streaming __attribute__(()) int n) // expected-error {{'__arm_streaming' only applies to function types}} + int (*)(ATTR_USE __attribute__(()) int n) // expected-error {{'ATTR_NAME' only applies to function types}} ) { } void f3( - int (__attribute__(()) __arm_streaming int n) // expected-error {{'__arm_streaming' cannot appear here}} + int (__attribute__(()) ATTR_USE int n) // expected-error {{'ATTR_NAME' cannot appear here}} ) { } void f4( - int (*)(__attribute__(()) __arm_streaming int n) // expected-error {{'__arm_streaming' cannot appear here}} + int (*)(__attribute__(()) ATTR_USE int n) // expected-error {{'ATTR_NAME' cannot appear here}} ) { } }; namespace base_specs { struct A {}; -struct B : __arm_streaming A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct C : __arm_streaming virtual A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct D : __arm_streaming public virtual A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct E : public __arm_streaming virtual A {}; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct F : virtual __arm_streaming public A {}; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a base specifier}} +struct B : ATTR_USE A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct C : ATTR_USE virtual A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct D : ATTR_USE public virtual A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct E : public ATTR_USE virtual A {}; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct F : virtual ATTR_USE public A {}; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} } -namespace __arm_streaming ns_attr {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} \ +namespace ATTR_USE ns_attr {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} \ expected-warning {{attributes on a namespace declaration are a C++17 extension}} diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index e63d9f0a8475..476da8534ce7 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -23,7 +23,7 @@ int16x8_t incompat_neon_smc(int16x8_t splat) __arm_streaming_compatible { return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, 
(int8x16_t)splat, 33); } -void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible __arm_shared_za { +void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible __arm_inout("za") { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} return __builtin_sme_svld1_hor_za128(0, 0, pg, ptr); } @@ -58,7 +58,7 @@ svuint32_t incompat_sve2_smc(svbool_t pg, svuint32_t a, int64_t b) __arm_streami return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); } -void incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_shared_za { +void incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_inout("za") { // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} svmops_za32_f32_m(0, pn, pm, zn, zm); } diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp index b59d67f7f57b..0a54a94f408b 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp +++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp @@ -4,16 +4,16 @@ void streaming_compatible_def() __arm_streaming_compatible {} // OK void streaming_def() __arm_streaming { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} -void shared_za_def() __arm_shared_za { } // expected-error {{function using ZA state requires 'sme'}} -__arm_new_za void new_za_def() { } // expected-error {{function using ZA state requires 'sme'}} +void shared_za_def() __arm_inout("za") { } // expected-error {{function using ZA state requires 'sme'}} +__arm_new("za") void new_za_def() { } // expected-error {{function using ZA state requires 'sme'}} __arm_locally_streaming void locally_streaming_def() { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} -void streaming_shared_za_def() __arm_streaming __arm_shared_za { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} +void streaming_shared_za_def() __arm_streaming __arm_inout("za") { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} // It should work fine when we explicitly add the target("sme") attribute. __attribute__((target("sme"))) void streaming_compatible_def_sme_attr() __arm_streaming_compatible {} // OK __attribute__((target("sme"))) void streaming_def_sme_attr() __arm_streaming { } // OK -__attribute__((target("sme"))) void shared_za_def_sme_attr() __arm_shared_za { } // OK -__arm_new_za __attribute__((target("sme"))) void new_za_def_sme_attr() {} // OK +__attribute__((target("sme"))) void shared_za_def_sme_attr() __arm_inout("za") { } // OK +__arm_new("za") __attribute__((target("sme"))) void new_za_def_sme_attr() {} // OK __arm_locally_streaming __attribute__((target("sme"))) void locally_streaming_def_sme_attr() {} // OK // Test that it also works with the target("sme2") attribute. @@ -22,7 +22,7 @@ __attribute__((target("sme2"))) void streaming_def_sme2_attr() __arm_streaming { // No code is generated for declarations, so it should be fine to declare using the attribute. 
void streaming_compatible_decl() __arm_streaming_compatible; // OK void streaming_decl() __arm_streaming; // OK -void shared_za_decl() __arm_shared_za; // OK +void shared_za_decl() __arm_inout("za"); // OK void non_streaming_decl(); void non_streaming_def(void (*streaming_fn_ptr)(void) __arm_streaming, diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c index 73c0934d689e..b986b0b3de2e 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -6,27 +6,25 @@ void sme_arm_streaming(void) __arm_streaming; void sme_arm_streaming_compatible(void) __arm_streaming_compatible; -__arm_new_za void sme_arm_new_za(void) {} -void sme_arm_shared_za(void) __arm_shared_za; -void sme_arm_preserves_za(void) __arm_preserves_za; +__arm_new("za") void sme_arm_new_za(void) {} +void sme_arm_shared_za(void) __arm_inout("za"); +void sme_arm_preserves_za(void) __arm_preserves("za"); -__arm_new_za void sme_arm_streaming_new_za(void) __arm_streaming {} -void sme_arm_streaming_shared_za(void) __arm_streaming __arm_shared_za; -void sme_arm_streaming_preserves_za(void) __arm_streaming __arm_preserves_za; +__arm_new("za") void sme_arm_streaming_new_za(void) __arm_streaming {} +void sme_arm_streaming_shared_za(void) __arm_streaming __arm_inout("za"); +void sme_arm_streaming_preserves_za(void) __arm_streaming __arm_preserves("za"); -__arm_new_za void sme_arm_sc_new_za(void) __arm_streaming_compatible {} -void sme_arm_sc_shared_za(void) __arm_streaming_compatible __arm_shared_za; -void sme_arm_sc_preserves_za(void) __arm_streaming_compatible __arm_preserves_za; - -void sme_arm_shared_preserves_za(void) __arm_shared_za __arm_preserves_za; +__arm_new("za") void sme_arm_sc_new_za(void) __arm_streaming_compatible {} +void sme_arm_sc_shared_za(void) __arm_streaming_compatible __arm_inout("za"); +void sme_arm_sc_preserves_za(void) __arm_streaming_compatible __arm_preserves("za"); __arm_locally_streaming void sme_arm_locally_streaming(void) { } __arm_locally_streaming void sme_arm_streaming_and_locally_streaming(void) __arm_streaming { } __arm_locally_streaming void sme_arm_streaming_and_streaming_compatible(void) __arm_streaming_compatible { } -__arm_locally_streaming __arm_new_za void sme_arm_ls_new_za(void) { } -__arm_locally_streaming void sme_arm_ls_shared_za(void) __arm_shared_za { } -__arm_locally_streaming void sme_arm_ls_preserves_za(void) __arm_preserves_za { } +__arm_locally_streaming __arm_new("za") void sme_arm_ls_new_za(void) { } +__arm_locally_streaming void sme_arm_ls_shared_za(void) __arm_inout("za") { } +__arm_locally_streaming void sme_arm_ls_preserves_za(void) __arm_preserves("za") { } // Valid attributes on function pointers @@ -38,18 +36,14 @@ void streaming_compatible_ptr(void) __arm_streaming_compatible; typedef void (*fptrty2) (void) __arm_streaming_compatible; fptrty2 call_sc_func() { return streaming_compatible_ptr; } -void shared_za_ptr(void) __arm_shared_za; -typedef void (*fptrty3) (void) __arm_shared_za; +void shared_za_ptr(void) __arm_inout("za"); +typedef void (*fptrty3) (void) __arm_inout("za"); fptrty3 call_shared_za_func() { return shared_za_ptr; } -void preserves_za_ptr(void) __arm_preserves_za; -typedef void (*fptrty4) (void) __arm_preserves_za; +void preserves_za_ptr(void) __arm_preserves("za"); +typedef void (*fptrty4) (void) __arm_preserves("za"); fptrty4 call_preserve_za_func() { return preserves_za_ptr; } -void shared_preserves_za_ptr(void) __arm_shared_za __arm_preserves_za; -typedef void 
(*fptrty5) (void) __arm_shared_za __arm_preserves_za; -fptrty5 call_shared_preserve_za_func() { return shared_preserves_za_ptr; } - typedef void (*fptrty6) (void); fptrty6 cast_nza_func_to_normal() { return sme_arm_new_za; } fptrty6 cast_ls_func_to_normal() { return sme_arm_locally_streaming; } @@ -68,13 +62,13 @@ void streaming_mode(void) __arm_streaming __arm_streaming_compatible; // expected-note@+1 {{conflicting attribute is here}} void streaming_compatible(void) __arm_streaming_compatible __arm_streaming; -// expected-cpp-error@+2 {{'__arm_new_za' and '__arm_shared_za' are not compatible}} -// expected-error@+1 {{'__arm_new_za' and '__arm_shared_za' are not compatible}} -__arm_new_za void new_shared_za(void) __arm_shared_za {} +// expected-cpp-error@+2 {{'__arm_new("za")' and '__arm_inout("za")' are not compatible}} +// expected-error@+1 {{'__arm_new("za")' and '__arm_inout("za")' are not compatible}} +__arm_new("za") void new_shared_za(void) __arm_inout("za") {} -// expected-cpp-error@+2 {{'__arm_new_za' and '__arm_preserves_za' are not compatible}} -// expected-error@+1 {{'__arm_new_za' and '__arm_preserves_za' are not compatible}} -__arm_new_za void new_preserves_za(void) __arm_preserves_za {} +// expected-cpp-error@+2 {{'__arm_new("za")' and '__arm_preserves("za")' are not compatible}} +// expected-error@+1 {{'__arm_new("za")' and '__arm_preserves("za")' are not compatible}} +__arm_new("za") void new_preserves_za(void) __arm_preserves("za") {} // Invalid attributes on function pointers @@ -125,24 +119,25 @@ sc_ptrty return_invalid_fptr_streaming_compatible_normal(n_ptrty f) { return f; // expected-error@+1 {{incompatible function pointer types returning 'sc_ptrty' (aka 'void (*)(void) __arm_streaming_compatible') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} n_ptrty return_invalid_fptr_normal_streaming_compatible(sc_ptrty f) { return f; } -typedef void (*sz_ptrty) (void) __arm_shared_za; +typedef void (*sz_ptrty) (void) __arm_inout("za"); sz_ptrty return_valid_shared_za_fptr(sz_ptrty f) { return f; } -// expected-cpp-error@+2 {{cannot initialize return object of type 'sz_ptrty' (aka 'void (*)() __arm_shared_za') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} -// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sz_ptrty' (aka 'void (*)(void) __arm_shared_za')}} +// expected-cpp-error@+2 {{cannot initialize return object of type 'sz_ptrty' (aka 'void (*)() __arm_inout("za")') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sz_ptrty' (aka 'void (*)(void) __arm_inout("za")')}} sz_ptrty return_invalid_fptr_shared_za_normal(n_ptrty f) { return f; } -// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sz_ptrty' (aka 'void (*)() __arm_shared_za')}} -// expected-error@+1 {{incompatible function pointer types returning 'sz_ptrty' (aka 'void (*)(void) __arm_shared_za') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sz_ptrty' (aka 'void (*)() __arm_inout("za")')}} +// expected-error@+1 {{incompatible function pointer types returning 'sz_ptrty' (aka 'void (*)(void) __arm_inout("za")') from a function with result type 'n_ptrty' (aka 
'void (*)(void)')}} n_ptrty return_invalid_fptr_normal_shared_za(sz_ptrty f) { return f; } -typedef void (*pz_ptrty) (void) __arm_preserves_za; +typedef void (*pz_ptrty) (void) __arm_preserves("za"); pz_ptrty return_valid_preserves_za_fptr(pz_ptrty f) { return f; } -// expected-cpp-error@+2 {{cannot initialize return object of type 'pz_ptrty' (aka 'void (*)() __arm_preserves_za') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} -// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'pz_ptrty' (aka 'void (*)(void) __arm_preserves_za')}} +// expected-cpp-error@+2 {{cannot initialize return object of type 'pz_ptrty' (aka 'void (*)() __arm_preserves("za")') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'pz_ptrty' (aka 'void (*)(void) __arm_preserves("za")')}} pz_ptrty return_invalid_fptr_preserves_za_normal(n_ptrty f) { return f; } -// No diagnostics, the preserves_za hint should be dropped silently. +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'pz_ptrty' (aka 'void (*)() __arm_preserves("za")')}} +// expected-error@+1 {{incompatible function pointer types returning 'pz_ptrty' (aka 'void (*)(void) __arm_preserves("za")') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} n_ptrty return_invalid_fptr_normal_preserves_za(pz_ptrty f) { return f; } // Test template instantiations @@ -164,21 +159,21 @@ template short templated(short); void redecl(void) __arm_streaming; void redecl(void) __arm_streaming_compatible { } -// expected-error@+5 {{function declared 'void (void) __arm_shared_za' was previously declared 'void (void) __arm_shared_za __arm_preserves_za', which has different SME function attributes}} +// expected-error@+5 {{function declared 'void (void)' was previously declared 'void (void) __arm_preserves("za")', which has different SME function attributes}} // expected-note@+3 {{previous declaration is here}} -// expected-cpp-error@+3 {{function declared 'void () __arm_shared_za' was previously declared 'void () __arm_shared_za __arm_preserves_za', which has different SME function attributes}} +// expected-cpp-error@+3 {{function declared 'void ()' was previously declared 'void () __arm_preserves("za")', which has different SME function attributes}} // expected-cpp-note@+1 {{previous declaration is here}} -void redecl_preserve_za(void) __arm_shared_za __arm_preserves_za;; -void redecl_preserve_za(void) __arm_shared_za {} +void redecl_preserve_za(void) __arm_preserves("za");; +void redecl_preserve_za(void) {} -// expected-error@+5 {{function declared 'void (void) __arm_shared_za __arm_preserves_za' was previously declared 'void (void) __arm_shared_za', which has different SME function attributes}} +// expected-error@+5 {{function declared 'void (void) __arm_preserves("za")' was previously declared 'void (void)', which has different SME function attributes}} // expected-note@+3 {{previous declaration is here}} -// expected-cpp-error@+3 {{function declared 'void () __arm_shared_za __arm_preserves_za' was previously declared 'void () __arm_shared_za', which has different SME function attributes}} +// expected-cpp-error@+3 {{function declared 'void () __arm_preserves("za")' was previously declared 'void ()', which has different SME function attributes}} // expected-cpp-note@+1 
{{previous declaration is here}}
-void redecl_nopreserve_za(void) __arm_shared_za;
-void redecl_nopreserve_za(void) __arm_shared_za __arm_preserves_za {}
+void redecl_nopreserve_za(void);
+void redecl_nopreserve_za(void) __arm_preserves("za") {}
 
-void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) {
+void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za"), void (*preserves_za_fn_ptr)(void) __arm_preserves("za")) {
   sme_arm_new_za(); // OK
   // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}}
   // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
@@ -186,43 +181,46 @@ void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) {
   // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}}
   // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
   shared_za_fn_ptr();
+  // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}}
+  // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
+  preserves_za_fn_ptr();
 }
 
-void shared_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) __arm_shared_za {
+void shared_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za")) __arm_inout("za") {
   sme_arm_shared_za(); // OK
   shared_za_fn_ptr(); // OK
 }
 
-__arm_new_za void new_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) {
+__arm_new("za") void new_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za")) {
   sme_arm_shared_za(); // OK
   shared_za_fn_ptr(); // OK
 }
 
 #ifdef __cplusplus
-int shared_za_initializer(void) __arm_shared_za;
+int shared_za_initializer(void) __arm_inout("za");
 // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
 int global = shared_za_initializer();
 
 struct S {
-  virtual void shared_za_memberfn(void) __arm_shared_za;
+  virtual void shared_za_memberfn(void) __arm_inout("za");
 };
 
 struct S2 : public S {
-// expected-cpp-error@+2 {{virtual function 'shared_za_memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_shared_za')}}
+// expected-cpp-error@+2 {{virtual function 'shared_za_memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_inout("za")')}}
 // expected-cpp-note@-5 {{overridden virtual function is here}}
-  __arm_new_za void shared_za_memberfn(void) override {}
+  __arm_new("za") void shared_za_memberfn(void) override {}
 };
 
-// The '__arm_preserves_za' property cannot be dropped when overriding a virtual
-// function. It is however fine for the overriding function to be '__arm_preserves_za'
+// The '__arm_preserves("za")' property cannot be dropped when overriding a virtual
+// function. It is likewise an error for the overriding function to be '__arm_preserves("za")'
 // even though the function that it overrides is not.
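// A minimal illustrative sketch of that rule (hypothetical type names, not
// part of this test file): the ZA attributes of an override must match the
// overridden function exactly.
//
//   struct Base { virtual void f(void) __arm_preserves("za"); };
//   struct Keep : Base { void f(void) __arm_preserves("za") override {} }; // OK
//   struct Drop : Base { void f(void) override {} }; // error: attribute dropped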
struct S_PreservesZA { - virtual void memberfn(void) __arm_preserves_za; + virtual void memberfn(void) __arm_preserves("za"); }; struct S_Drop_PreservesZA : S_PreservesZA { -// expected-cpp-error@+2 {{virtual function 'memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_preserves_za')}} +// expected-cpp-error@+2 {{virtual function 'memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_preserves("za")')}} // expected-cpp-note@-5 {{overridden virtual function is here}} void memberfn(void) override {} }; @@ -230,9 +228,11 @@ struct S_Drop_PreservesZA : S_PreservesZA { struct S_NoPreservesZA { virtual void memberfn(void); }; + struct S_AddPreservesZA : S_NoPreservesZA { -// This is fine, the overridden function just adds more guarantees. - void memberfn(void) __arm_preserves_za override {} +// expected-cpp-error@+2 {{virtual function 'memberfn' has different attributes ('void () __arm_preserves("za")') than the function it overrides (which has 'void ()')}} +// expected-cpp-note@-5 {{overridden virtual function is here}} + void memberfn(void) __arm_preserves("za") override {} }; @@ -258,20 +258,20 @@ struct S3 { }; template <> -struct S3 { +struct S3 { static constexpr int value = 8; }; template <> -struct S3 { +struct S3 { static constexpr int value = 16; }; void normal_func(void) {} void streaming_func(void) __arm_streaming {} void streaming_compatible_func(void) __arm_streaming_compatible {} -void shared_za_func(void) __arm_shared_za {} -void preserves_za_func(void) __arm_preserves_za {} +void shared_za_func(void) __arm_inout("za") {} +void preserves_za_func(void) __arm_preserves("za") {} static_assert(S3::value == 1, "why are we picking the wrong specialization?"); static_assert(S3::value == 2, "why are we picking the wrong specialization?"); @@ -295,8 +295,8 @@ template int test_templated_f(T); template<> constexpr int test_templated_f(void(*)(void)) { return 1; } template<> constexpr int test_templated_f(void(*)(void)__arm_streaming) { return 2; } template<> constexpr int test_templated_f(void(*)(void)__arm_streaming_compatible) { return 4; } -template<> constexpr int test_templated_f(void(*)(void)__arm_shared_za) { return 8; } -template<> constexpr int test_templated_f(void(*)(void)__arm_preserves_za) { return 16; } +template<> constexpr int test_templated_f(void(*)(void)__arm_inout("za")) { return 8; } +template<> constexpr int test_templated_f(void(*)(void)__arm_preserves("za")) { return 16; } static_assert(test_templated_f(&normal_func) == 1, "Instantiated to wrong function"); static_assert(test_templated_f(&streaming_func) == 2, "Instantiated to wrong function"); @@ -312,8 +312,8 @@ int invalid_type_for_attribute __arm_streaming; constexpr int overload(void f(void)) { return 1; } constexpr int overload(void f(void) __arm_streaming) { return 2; } constexpr int overload(void f(void) __arm_streaming_compatible) { return 4; } -constexpr int overload(void f(void) __arm_shared_za) { return 8; } -constexpr int overload(void f(void) __arm_preserves_za) { return 16; } +constexpr int overload(void f(void) __arm_inout("za")) { return 8; } +constexpr int overload(void f(void) __arm_preserves("za")) { return 16; } static_assert(overload(&normal_func) == 1, "Overloaded to wrong function"); static_assert(overload(&streaming_func) == 2, "Overloaded to wrong function"); static_assert(overload(&streaming_compatible_func) == 4, "Overloaded to wrong function"); @@ -330,3 +330,73 @@ constexpr X *ptr = 0; 
static_assert(overload_int(ptr->foo) == 2, "Overloaded to the wrong function after implicit instantiation"); #endif // ifdef __cplusplus + +// expected-cpp-error@+2 {{unknown state ''}} +// expected-error@+1 {{unknown state ''}} +__arm_new("") void invalid_arm_new_empty_string(void); +// expected-cpp-error@+2 {{expected string literal as argument of '__arm_new' attribute}} +// expected-error@+1 {{expected string literal as argument of '__arm_new' attribute}} +__arm_new(0) void invalid_arm_new_non_literal_string(void); +// expected-cpp-error@+2 {{unknown state 'unknownstate'}} +// expected-error@+1 {{unknown state 'unknownstate'}} +__arm_new("unknownstate") void invalid_arm_new_unknown_state(void); + +// expected-cpp-error@+2 {{unknown state ''}} +// expected-error@+1 {{unknown state ''}} +void invalid_arm_in_empty_string(void) __arm_in(""); +// expected-cpp-error@+2 {{expected string literal as argument of '__arm_in' attribute}} +// expected-error@+1 {{expected string literal as argument of '__arm_in' attribute}} +void invalid_arm_in_non_literal_string(void) __arm_in(0); +// expected-cpp-error@+2 {{unknown state 'unknownstate'}} +// expected-error@+1 {{unknown state 'unknownstate'}} +void invalid_arm_in_unknown_state(void) __arm_in("unknownstate"); + +void valid_state_attrs_in_in1(void) __arm_in("za"); +void valid_state_attrs_in_in2(void) __arm_in("za", "za"); + +// expected-cpp-error@+2 {{missing state for '__arm_in'}} +// expected-error@+1 {{missing state for '__arm_in'}} +void invalid_state_attrs_no_arg1(void) __arm_in(); +// expected-cpp-error@+2 {{missing state for '__arm_new'}} +// expected-error@+1 {{missing state for '__arm_new'}} +__arm_new() void invalid_state_attrs_no_arg2(void); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_in_out(void) __arm_in("za") __arm_out("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_in_inout(void) __arm_in("za") __arm_inout("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_in_preserves(void) __arm_in("za") __arm_preserves("za"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_out_in(void) __arm_out("za") __arm_in("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_out_inout(void) __arm_out("za") __arm_inout("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_out_preserves(void) __arm_out("za") __arm_preserves("za"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_inout_in(void) __arm_inout("za") __arm_in("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_inout_out(void) __arm_inout("za") __arm_out("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 
'za'}} +void conflicting_state_attrs_inout_preserves(void) __arm_inout("za") __arm_preserves("za"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_preserves_in(void) __arm_preserves("za") __arm_in("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_preserves_out(void) __arm_preserves("za") __arm_out("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_preserves_inout(void) __arm_preserves("za") __arm_inout("za"); diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index 529d0d2d1e62..40254a5a0eaf 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -12,7 +12,7 @@ #include -void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} @@ -32,7 +32,7 @@ void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, slice, pg, svundef_s8()); } -void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} @@ -52,7 +52,7 @@ void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, slice, pg, svundef_s16()); } -void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} @@ -90,7 +90,7 @@ void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar SVE_ACLE_FUNC(svusmops_za32, _u8, _m,)(-1, pg, pg, svundef_u8(), svundef_s8()); } -void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} @@ -133,7 +133,7 @@ void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar SVE_ACLE_FUNC(svmops_za64, _f64, _m,)(-1, pg, pg, svundef_f64(), svundef_f64()); } -void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming 
__arm_shared_za { +void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} @@ -153,14 +153,14 @@ void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __a SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, slice, pg, svundef_s8()); } -void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(256); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(-1); } -void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}} SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c index 95bb6be2d2d3..f1e858f81960 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c @@ -6,21 +6,21 @@ #include __attribute__((target("sme"))) -void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { svld1_hor_za8(0, 0, pg, ptr); } __attribute__((target("arch=armv8-a+sme"))) -void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { svld1_hor_vnum_za32(0, 0, pg, ptr, 0); } __attribute__((target("+sme"))) -void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za { +void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { svst1_ver_za16(0, 0, pg, ptr); } __attribute__((target("+sme"))) -void undefined(svbool_t pg, void *ptr) __arm_shared_za { +void undefined(svbool_t pg, void *ptr) __arm_inout("za") { svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-warning {{builtin call has undefined behaviour when called from a non-streaming function}} } diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 529db64cf5cc..30790bc45ce3 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3459,23 +3459,27 @@ static void GenerateHasAttrSpellingStringSwitch( OS << " .Default(0);\n"; } -// Emits the list of tokens for regular keyword attributes. -void EmitClangAttrTokenKinds(RecordKeeper &Records, raw_ostream &OS) { - emitSourceFileHeader("A list of tokens generated from the attribute" - " definitions", - OS); +// Emits list of regular keyword attributes with info about their arguments. 
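// For illustration (editor's sketch, not part of the upstream change): with
// the new emitter, a regular keyword attribute that takes a non-fake
// argument, e.g. __arm_new, is emitted as
//   KEYWORD_ATTRIBUTE(__arm_new, true,)
// while an argumentless one, e.g. __arm_streaming, is emitted as
//   KEYWORD_ATTRIBUTE(__arm_streaming, false,)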
+void EmitClangRegularKeywordAttributeInfo(RecordKeeper &Records, + raw_ostream &OS) { + emitSourceFileHeader( + "A list of regular keyword attributes generated from the attribute" + " definitions", + OS); // Assume for now that the same token is not used in multiple regular // keyword attributes. for (auto *R : Records.getAllDerivedDefinitions("Attr")) - for (const auto &S : GetFlattenedSpellings(*R)) - if (isRegularKeywordAttribute(S)) { - if (!R->getValueAsListOfDefs("Args").empty()) - PrintError(R->getLoc(), - "RegularKeyword attributes with arguments are not " - "yet supported"); - OS << "KEYWORD_ATTRIBUTE(" - << S.getSpellingRecord().getValueAsString("Name") << ", )\n"; - } + for (const auto &S : GetFlattenedSpellings(*R)) { + if (!isRegularKeywordAttribute(S)) + continue; + std::vector Args = R->getValueAsListOfDefs("Args"); + bool HasArgs = llvm::any_of( + Args, [](const Record *Arg) { return !Arg->getValueAsBit("Fake"); }); + + OS << "KEYWORD_ATTRIBUTE(" + << S.getSpellingRecord().getValueAsString("Name") << ", " + << (HasArgs ? "true" : "false") << ",)\n"; + } OS << "#undef KEYWORD_ATTRIBUTE\n"; } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index c239b9d300df..a628e4e18be5 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1541,7 +1541,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "}\n\n"; OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) " - "__arm_streaming_compatible __arm_shared_za " + "__arm_streaming_compatible __arm_out(\"za\") " "{ }\n\n"; createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME); diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 4ff7b7d43aab..558c901be53b 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -35,7 +35,7 @@ enum ActionType { GenClangAttrSubjectMatchRuleList, GenClangAttrPCHRead, GenClangAttrPCHWrite, - GenClangAttrTokenKinds, + GenClangRegularKeywordAttributeInfo, GenClangAttrHasAttributeImpl, GenClangAttrSpellingListIndex, GenClangAttrASTVisitor, @@ -139,8 +139,10 @@ cl::opt Action( "Generate clang PCH attribute reader"), clEnumValN(GenClangAttrPCHWrite, "gen-clang-attr-pch-write", "Generate clang PCH attribute writer"), - clEnumValN(GenClangAttrTokenKinds, "gen-clang-attr-token-kinds", - "Generate a list of attribute-related clang tokens"), + clEnumValN(GenClangRegularKeywordAttributeInfo, + "gen-clang-regular-keyword-attr-info", + "Generate a list of regular keyword attributes with info " + "about their arguments"), clEnumValN(GenClangAttrHasAttributeImpl, "gen-clang-attr-has-attribute-impl", "Generate a clang attribute spelling list"), @@ -278,11 +280,14 @@ cl::opt Action( "Generate riscv_vector_builtin_cg.inc for clang"), clEnumValN(GenRISCVVectorBuiltinSema, "gen-riscv-vector-builtin-sema", "Generate riscv_vector_builtin_sema.inc for clang"), - clEnumValN(GenRISCVSiFiveVectorBuiltins, "gen-riscv-sifive-vector-builtins", + clEnumValN(GenRISCVSiFiveVectorBuiltins, + "gen-riscv-sifive-vector-builtins", "Generate riscv_sifive_vector_builtins.inc for clang"), - clEnumValN(GenRISCVSiFiveVectorBuiltinCG, "gen-riscv-sifive-vector-builtin-codegen", + clEnumValN(GenRISCVSiFiveVectorBuiltinCG, + "gen-riscv-sifive-vector-builtin-codegen", "Generate riscv_sifive_vector_builtin_cg.inc for clang"), - clEnumValN(GenRISCVSiFiveVectorBuiltinSema, "gen-riscv-sifive-vector-builtin-sema", + clEnumValN(GenRISCVSiFiveVectorBuiltinSema, + 
"gen-riscv-sifive-vector-builtin-sema", "Generate riscv_sifive_vector_builtin_sema.inc for clang"), clEnumValN(GenAttrDocs, "gen-attr-docs", "Generate attribute documentation"), @@ -336,8 +341,8 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenClangAttrPCHWrite: EmitClangAttrPCHWrite(Records, OS); break; - case GenClangAttrTokenKinds: - EmitClangAttrTokenKinds(Records, OS); + case GenClangRegularKeywordAttributeInfo: + EmitClangRegularKeywordAttributeInfo(Records, OS); break; case GenClangAttrHasAttributeImpl: EmitClangAttrHasAttrImpl(Records, OS); diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index ce552ebe73db..8b8e2668efa3 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -43,8 +43,8 @@ void EmitClangAttrSubjectMatchRuleList(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrPCHRead(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrPCHWrite(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangAttrTokenKinds(llvm::RecordKeeper &Records, - llvm::raw_ostream &OS); +void EmitClangRegularKeywordAttributeInfo(llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangAttrHasAttrImpl(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrSpellingListIndex(llvm::RecordKeeper &Records, -- Gitee From ac5cd013edbd6d42877bb44fe9f27403ae6238d6 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 16 Jan 2024 10:37:27 +0000 Subject: [PATCH 54/77] [Clang][AArch64] Remove unnecessary and incorrect attributes from arm_sme.h. These attributes were using the GNU attribute syntax, rather than the new keyword attribute syntax, and they are no longer required as we have code in SemaChecking to verify whether a builtin is compatible with its caller. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/utils/TableGen/SveEmitter.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index a628e4e18be5..3828ad27b117 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1005,17 +1005,6 @@ void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, std::string FullName = mangleName(ClassS); std::string ProtoName = mangleName(getClassKind()); - std::string SMEAttrs = ""; - - if (Flags & Emitter.getEnumValueForFlag("IsStreaming")) - SMEAttrs += ", arm_streaming"; - if (Flags & Emitter.getEnumValueForFlag("IsStreamingCompatible")) - SMEAttrs += ", arm_streaming_compatible"; - if (Flags & Emitter.getEnumValueForFlag("IsSharedZA")) - SMEAttrs += ", arm_shared_za"; - if (Flags & Emitter.getEnumValueForFlag("IsPreservesZA")) - SMEAttrs += ", arm_preserves_za"; - OS << (IsOverloaded ? 
"__aio " : "__ai ") << "__attribute__((__clang_arm_builtin_alias("; @@ -1028,8 +1017,6 @@ void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, break; } - if (!SMEAttrs.empty()) - OS << SMEAttrs; OS << "))\n"; OS << getTypes()[0].str() << " " << ProtoName << "("; -- Gitee From 53c3274e538bccc670db72600ae4c29fddcb66cf Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Thu, 18 Jan 2024 09:17:23 +0000 Subject: [PATCH 55/77] [AArch64][SME] Conditionally do smstart/smstop (#77113) This patch adds conditional enabling/disabling of streaming mode for functions which have both the aarch64_pstate_sm_compatible and aarch64_pstate_sm_body attributes. This combination allows callees to determine if switching streaming mode is required instead of relying on the caller. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 50 ++++--- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 8 +- .../AArch64/AArch64MachineFunctionInfo.h | 7 + ...ing-body-streaming-compatible-interface.ll | 124 ++++++++++++++++++ 4 files changed, 167 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c814096a304f..d3901659d7d5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4761,17 +4761,9 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask); } -SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, - SMEAttrs Attrs, SDLoc DL, - EVT VT) const { - if (Attrs.hasStreamingInterfaceOrBody()) - return DAG.getConstant(1, DL, VT); - - if (Attrs.hasNonStreamingInterfaceAndBody()) - return DAG.getConstant(0, DL, VT); - - assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface"); - +SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG, + SDValue Chain, SDLoc DL, + EVT VT) const { SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", getPointerTy(DAG.getDataLayout())); Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); @@ -6669,9 +6661,18 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // make sure it is Glued to the last CopyFromReg value. if (IsLocallyStreaming) { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); - Chain = - changeStreamingMode(DAG, DL, /*Enable*/ true, DAG.getRoot(), Glue, - DAG.getConstant(0, DL, MVT::i64), /*Entry*/ true); + SDValue PStateSM; + if (Attrs.hasStreamingCompatibleInterface()) { + PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); + Register Reg = MF.getRegInfo().createVirtualRegister( + getRegClassFor(PStateSM.getValueType().getSimpleVT())); + FuncInfo->setPStateSMReg(Reg); + Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM); + } else { + PStateSM = DAG.getConstant(0, DL, MVT::i64); + } + Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, PStateSM, + /*Entry*/ true); // Ensure that the SMSTART happens after the CopyWithChain such that its // chain result is used. 
@@ -7429,7 +7430,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, std::optional RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs); if (RequiresSMChange) { - PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); + if (CallerAttrs.hasStreamingInterfaceOrBody()) + PStateSM = DAG.getConstant(1, DL, MVT::i64); + else if (CallerAttrs.hasNonStreamingInterface()) + PStateSM = DAG.getConstant(0, DL, MVT::i64); + else + PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); OptimizationRemarkEmitter ORE(&MF.getFunction()); ORE.emit([&]() { auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition", @@ -7982,9 +7988,17 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Emit SMSTOP before returning from a locally streaming function SMEAttrs FuncAttrs(MF.getFunction()); if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { - Chain = changeStreamingMode( - DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), - DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true); + if (FuncAttrs.hasStreamingCompatibleInterface()) { + Register Reg = FuncInfo->getPStateSMReg(); + assert(Reg.isValid() && "PStateSM Register is invalid"); + SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64); + Chain = + changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, + /*Glue*/ SDValue(), PStateSM, /*Entry*/ false); + } else + Chain = changeStreamingMode( + DAG, DL, /*Enable*/ false, Chain, + /*Glue*/ SDValue(), DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true); Glue = Chain.getValue(1); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index fc682484de54..060c939f7017 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1238,10 +1238,10 @@ private: // This function does not handle predicate bitcasts. SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; - // Returns the runtime value for PSTATE.SM. When the function is streaming- - // compatible, this generates a call to __arm_sme_state. - SDValue getPStateSM(SelectionDAG &DAG, SDValue Chain, SMEAttrs Attrs, - SDLoc DL, EVT VT) const; + // Returns the runtime value for PSTATE.SM by generating a call to + // __arm_sme_state. + SDValue getRuntimePStateSM(SelectionDAG &DAG, SDValue Chain, SDLoc DL, + EVT VT) const; bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, LLT Ty2) const override; diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 8df95ff1e6ea..7d841cdc22df 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -194,6 +194,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// True if the function need asynchronous unwind information. mutable std::optional NeedsAsyncDwarfUnwindInfo; + // Holds a register containing pstate.sm. This is set + // on function entry to record the initial pstate of a function. 
+ Register PStateSMReg = MCRegister::NoRegister; + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -202,6 +206,9 @@ public: const DenseMap &Src2DstMBB) const override; + Register getPStateSMReg() const { return PStateSMReg; }; + void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; + bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll new file mode 100644 index 000000000000..d67573384ca9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s + +declare void @normal_callee(); +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + +define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: sm_body_sm_compatible_simple: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: tbnz w8, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: tbnz w8, #0, .LBB0_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: fmov s0, wzr +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + ret float zeroinitializer +} + +define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbnz w19, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: tbnz w19, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @normal_callee() + ret void +} + +; Function Attrs: nounwind uwtable vscale_range(1,16) +define void @streaming_body_and_streaming_compatible_interface_multi_basic_block(i32 noundef %x) "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: streaming_body_and_streaming_compatible_interface_multi_basic_block: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbnz w19, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB2_2: // %entry +; CHECK-NEXT: cbz w8, .LBB2_6 +; CHECK-NEXT: // %bb.3: // %if.else +; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: tbnz w19, #0, .LBB2_5 +; CHECK-NEXT: // %bb.4: // %if.else +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB2_5: // %if.else +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_6: // %if.then +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: tbnz w19, #0, .LBB2_8 +; CHECK-NEXT: // %bb.7: // %if.then +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB2_8: // %if.then +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + tail call void @normal_callee() + br label %return + +if.else: ; preds = %entry + tail call void @streaming_compatible_callee() + br label %return + +return: ; preds = %if.else, %if.then + ret void +} -- Gitee From 196e8f3cfd2832e6bc06aa8f764cb02314eb1a5d Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 18 Jan 2024 09:51:34 +0000 Subject: [PATCH 
56/77] [Clang][SME] Add missing IsStreamingCompatible flag to svget, svcreate & svset (#78430) Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/arm_sve.td | 36 +++++++++---------- .../acle_sve_create2-bfloat.c | 9 ++++- .../aarch64-sve-intrinsics/acle_sve_create2.c | 29 +++++++++------ .../acle_sve_create3-bfloat.c | 9 ++++- .../aarch64-sve-intrinsics/acle_sve_create3.c | 29 +++++++++------ .../acle_sve_create4-bfloat.c | 9 ++++- .../aarch64-sve-intrinsics/acle_sve_create4.c | 29 +++++++++------ .../acle_sve_get2-bfloat.c | 11 ++++-- .../aarch64-sve-intrinsics/acle_sve_get2.c | 29 +++++++++------ .../acle_sve_get3-bfloat.c | 13 +++++-- .../aarch64-sve-intrinsics/acle_sve_get3.c | 30 ++++++++++------ .../acle_sve_get4-bfloat.c | 15 +++++--- .../aarch64-sve-intrinsics/acle_sve_get4.c | 29 +++++++++------ .../acle_sve_set2-bfloat.c | 11 ++++-- .../aarch64-sve-intrinsics/acle_sve_set2.c | 29 +++++++++------ .../acle_sve_set3-bfloat.c | 12 +++++-- .../aarch64-sve-intrinsics/acle_sve_set3.c | 28 +++++++++------ .../acle_sve_set4-bfloat.c | 14 +++++--- .../aarch64-sve-intrinsics/acle_sve_set4.c | 28 +++++++++------ 19 files changed, 261 insertions(+), 138 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 758a4fe84dae..ef013977ffdc 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1217,9 +1217,9 @@ def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [I def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; -def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; +def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; @@ -1227,29 +1227,29 @@ def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef, I def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; -def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate]>; +def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate, 
IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Vector insertion and extraction -def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; +def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; let TargetGuard = "sve,bf16" in { -def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; +def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; } //////////////////////////////////////////////////////////////////////////////// diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c index 2605d574b830..dec7e1297ebf 100644 --- 
a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate2_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[X0:%.*]], i64 0) @@ -27,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbfloat16x2_t test_svcreate2_bf16(svbfloat16_t x0, svbfloat16_t x1) +svbfloat16x2_t test_svcreate2_bf16(svbfloat16_t x0, svbfloat16_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_bf16,,)(x0, x1); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c index 08e4280064a6..65d4e6118485 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[X0:%.*]], i64 0) @@ -27,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP0]], [[X1:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint8x2_t test_svcreate2_s8(svint8_t x0, svint8_t x1) +svint8x2_t test_svcreate2_s8(svint8_t x0, svint8_t x1) ATTR { return 
SVE_ACLE_FUNC(svcreate2,_s8,,)(x0, x1); } @@ -44,7 +51,7 @@ svint8x2_t test_svcreate2_s8(svint8_t x0, svint8_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint16x2_t test_svcreate2_s16(svint16_t x0, svint16_t x1) +svint16x2_t test_svcreate2_s16(svint16_t x0, svint16_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_s16,,)(x0, x1); } @@ -61,7 +68,7 @@ svint16x2_t test_svcreate2_s16(svint16_t x0, svint16_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[X1:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint32x2_t test_svcreate2_s32(svint32_t x0, svint32_t x1) +svint32x2_t test_svcreate2_s32(svint32_t x0, svint32_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_s32,,)(x0, x1); } @@ -78,7 +85,7 @@ svint32x2_t test_svcreate2_s32(svint32_t x0, svint32_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP0]], [[X1:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint64x2_t test_svcreate2_s64(svint64_t x0, svint64_t x1) +svint64x2_t test_svcreate2_s64(svint64_t x0, svint64_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_s64,,)(x0, x1); } @@ -95,7 +102,7 @@ svint64x2_t test_svcreate2_s64(svint64_t x0, svint64_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP0]], [[X1:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint8x2_t test_svcreate2_u8(svuint8_t x0, svuint8_t x1) +svuint8x2_t test_svcreate2_u8(svuint8_t x0, svuint8_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_u8,,)(x0, x1); } @@ -112,7 +119,7 @@ svuint8x2_t test_svcreate2_u8(svuint8_t x0, svuint8_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint16x2_t test_svcreate2_u16(svuint16_t x0, svuint16_t x1) +svuint16x2_t test_svcreate2_u16(svuint16_t x0, svuint16_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_u16,,)(x0, x1); } @@ -129,7 +136,7 @@ svuint16x2_t test_svcreate2_u16(svuint16_t x0, svuint16_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[X1:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint32x2_t test_svcreate2_u32(svuint32_t x0, svuint32_t x1) +svuint32x2_t test_svcreate2_u32(svuint32_t x0, svuint32_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_u32,,)(x0, x1); } @@ -146,7 +153,7 @@ svuint32x2_t test_svcreate2_u32(svuint32_t x0, svuint32_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP0]], [[X1:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint64x2_t test_svcreate2_u64(svuint64_t x0, svuint64_t x1) +svuint64x2_t test_svcreate2_u64(svuint64_t x0, svuint64_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_u64,,)(x0, x1); } @@ -163,7 +170,7 @@ svuint64x2_t test_svcreate2_u64(svuint64_t x0, svuint64_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svfloat16x2_t test_svcreate2_f16(svfloat16_t x0, svfloat16_t x1) +svfloat16x2_t test_svcreate2_f16(svfloat16_t x0, svfloat16_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_f16,,)(x0, x1); } @@ -180,7 +187,7 @@ svfloat16x2_t test_svcreate2_f16(svfloat16_t x0, svfloat16_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP0]], [[X1:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP1]] // -svfloat32x2_t test_svcreate2_f32(svfloat32_t 
x0, svfloat32_t x1) +svfloat32x2_t test_svcreate2_f32(svfloat32_t x0, svfloat32_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_f32,,)(x0, x1); } @@ -197,7 +204,7 @@ svfloat32x2_t test_svcreate2_f32(svfloat32_t x0, svfloat32_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP0]], [[X1:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP1]] // -svfloat64x2_t test_svcreate2_f64(svfloat64_t x0, svfloat64_t x1) +svfloat64x2_t test_svcreate2_f64(svfloat64_t x0, svfloat64_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_f64,,)(x0, x1); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c index 760708cfb7a0..d8c22cfb88a0 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate3_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( poison, [[X0:%.*]], i64 0) @@ -29,7 +36,7 @@ // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svbfloat16x3_t test_svcreate3_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2) +svbfloat16x3_t test_svcreate3_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_bf16,,)(x0, x1, x2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c index 325997ab6f91..9f485d6d3a6c 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 
-DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate3_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[X0:%.*]], i64 0) @@ -29,7 +36,7 @@ // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP1]], [[X2:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint8x3_t test_svcreate3_s8(svint8_t x0, svint8_t x1, svint8_t x2) +svint8x3_t test_svcreate3_s8(svint8_t x0, svint8_t x1, svint8_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s8,,)(x0, x1, x2); } @@ -48,7 +55,7 @@ svint8x3_t test_svcreate3_s8(svint8_t x0, svint8_t x1, svint8_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint16x3_t test_svcreate3_s16(svint16_t x0, svint16_t x1, svint16_t x2) +svint16x3_t test_svcreate3_s16(svint16_t x0, svint16_t x1, svint16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s16,,)(x0, x1, x2); } @@ -67,7 +74,7 @@ svint16x3_t test_svcreate3_s16(svint16_t x0, svint16_t x1, svint16_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP1]], [[X2:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint32x3_t test_svcreate3_s32(svint32_t x0, svint32_t x1, svint32_t x2) +svint32x3_t test_svcreate3_s32(svint32_t x0, svint32_t x1, svint32_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s32,,)(x0, x1, x2); } @@ -86,7 +93,7 @@ svint32x3_t test_svcreate3_s32(svint32_t x0, svint32_t x1, svint32_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP1]], [[X2:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint64x3_t test_svcreate3_s64(svint64_t x0, svint64_t x1, svint64_t x2) +svint64x3_t test_svcreate3_s64(svint64_t x0, svint64_t x1, svint64_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s64,,)(x0, x1, x2); } @@ -105,7 +112,7 @@ svint64x3_t test_svcreate3_s64(svint64_t x0, svint64_t x1, svint64_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP1]], [[X2:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint8x3_t test_svcreate3_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2) +svuint8x3_t test_svcreate3_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u8,,)(x0, x1, x2); } @@ -124,7 +131,7 @@ svuint8x3_t test_svcreate3_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint16x3_t test_svcreate3_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2) +svuint16x3_t test_svcreate3_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u16,,)(x0, x1, x2); } @@ -143,7 +150,7 @@ svuint16x3_t test_svcreate3_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP1]], [[X2:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint32x3_t test_svcreate3_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2) +svuint32x3_t test_svcreate3_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u32,,)(x0, x1, x2); } @@ 
-162,7 +169,7 @@ svuint32x3_t test_svcreate3_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP1]], [[X2:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint64x3_t test_svcreate3_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2) +svuint64x3_t test_svcreate3_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u64,,)(x0, x1, x2); } @@ -181,7 +188,7 @@ svuint64x3_t test_svcreate3_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svfloat16x3_t test_svcreate3_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2) +svfloat16x3_t test_svcreate3_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_f16,,)(x0, x1, x2); } @@ -200,7 +207,7 @@ svfloat16x3_t test_svcreate3_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP1]], [[X2:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP2]] // -svfloat32x3_t test_svcreate3_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2) +svfloat32x3_t test_svcreate3_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_f32,,)(x0, x1, x2); } @@ -219,7 +226,7 @@ svfloat32x3_t test_svcreate3_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP1]], [[X2:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP2]] // -svfloat64x3_t test_svcreate3_f64(svfloat64_t x0, svfloat64_t x1, svfloat64_t x2) +svfloat64x3_t test_svcreate3_f64(svfloat64_t x0, svfloat64_t x1, svfloat64_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_f64,,)(x0, x1, x2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c index e3af56c5d133..ca90a435af8c 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate4_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[X0:%.*]], i64 0) @@ -31,7 +38,7 @@ // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svbfloat16x4_t test_svcreate4_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2, svbfloat16_t x4) +svbfloat16x4_t test_svcreate4_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2, svbfloat16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_bf16,,)(x0, x1, x2, x4); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c index 7e0c2a7f00d1..de8a5b061657 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate4_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[X0:%.*]], i64 0) @@ -31,7 +38,7 @@ // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[X4:%.*]], i64 48) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint8x4_t test_svcreate4_s8(svint8_t x0, svint8_t x1, svint8_t x2, svint8_t x4) +svint8x4_t test_svcreate4_s8(svint8_t x0, svint8_t x1, svint8_t x2, svint8_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s8,,)(x0, x1, x2, x4); } @@ -52,7 +59,7 @@ svint8x4_t test_svcreate4_s8(svint8_t x0, svint8_t x1, svint8_t x2, svint8_t x4) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint16x4_t test_svcreate4_s16(svint16_t x0, svint16_t x1, svint16_t x2, svint16_t x4) +svint16x4_t test_svcreate4_s16(svint16_t x0, svint16_t x1, svint16_t x2, svint16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s16,,)(x0, x1, x2, x4); } @@ -73,7 +80,7 @@ svint16x4_t test_svcreate4_s16(svint16_t x0, svint16_t x1, svint16_t x2, svint16 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[X4:%.*]], i64 12) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint32x4_t test_svcreate4_s32(svint32_t x0, svint32_t x1, svint32_t x2, svint32_t x4) +svint32x4_t test_svcreate4_s32(svint32_t x0, svint32_t x1, svint32_t x2, svint32_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s32,,)(x0, x1, x2, x4); } @@ -94,7 +101,7 @@ svint32x4_t test_svcreate4_s32(svint32_t x0, svint32_t x1, svint32_t x2, svint32 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[X4:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint64x4_t test_svcreate4_s64(svint64_t x0, svint64_t x1, svint64_t x2, 
svint64_t x4) +svint64x4_t test_svcreate4_s64(svint64_t x0, svint64_t x1, svint64_t x2, svint64_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s64,,)(x0, x1, x2, x4); } @@ -115,7 +122,7 @@ svint64x4_t test_svcreate4_s64(svint64_t x0, svint64_t x1, svint64_t x2, svint64 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[X4:%.*]], i64 48) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint8x4_t test_svcreate4_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2, svuint8_t x4) +svuint8x4_t test_svcreate4_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2, svuint8_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u8,,)(x0, x1, x2, x4); } @@ -136,7 +143,7 @@ svuint8x4_t test_svcreate4_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2, svuint8_ // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint16x4_t test_svcreate4_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2, svuint16_t x4) +svuint16x4_t test_svcreate4_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2, svuint16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u16,,)(x0, x1, x2, x4); } @@ -157,7 +164,7 @@ svuint16x4_t test_svcreate4_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2, svu // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[X4:%.*]], i64 12) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint32x4_t test_svcreate4_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2, svuint32_t x4) +svuint32x4_t test_svcreate4_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2, svuint32_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u32,,)(x0, x1, x2, x4); } @@ -178,7 +185,7 @@ svuint32x4_t test_svcreate4_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2, svu // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[X4:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint64x4_t test_svcreate4_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2, svuint64_t x4) +svuint64x4_t test_svcreate4_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2, svuint64_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u64,,)(x0, x1, x2, x4); } @@ -199,7 +206,7 @@ svuint64x4_t test_svcreate4_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2, svu // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svfloat16x4_t test_svcreate4_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2, svfloat16_t x4) +svfloat16x4_t test_svcreate4_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2, svfloat16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_f16,,)(x0, x1, x2, x4); } @@ -220,7 +227,7 @@ svfloat16x4_t test_svcreate4_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2, // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[X4:%.*]], i64 12) // CPP-CHECK-NEXT: ret [[TMP3]] // -svfloat32x4_t test_svcreate4_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2, svfloat32_t x4) +svfloat32x4_t test_svcreate4_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2, svfloat32_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_f32,,)(x0, x1, x2, x4); } @@ -241,7 +248,7 @@ svfloat32x4_t test_svcreate4_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2, // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[X4:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP3]] // -svfloat64x4_t test_svcreate4_f64(svfloat64_t x0, svfloat64_t x1, svfloat64_t x2, svfloat64_t x4) +svfloat64x4_t test_svcreate4_f64(svfloat64_t 
x0, svfloat64_t x1, svfloat64_t x2, svfloat64_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_f64,,)(x0, x1, x2, x4); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c index 25dc49a4c2bd..b9c46b2261f5 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget2_bf16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TUPLE:%.*]], i64 0) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) +svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 0); } @@ -40,7 +47,7 @@ svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget2_bf16_1(svbfloat16x2_t tuple) +svbfloat16_t test_svget2_bf16_1(svbfloat16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 1); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c index 32a84c91b74d..8cd887aaff40 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +15,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef 
TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[TUPLE:%.*]], i64 0) @@ -24,7 +31,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svget2_s8(svint8x2_t tuple) +svint8_t test_svget2_s8(svint8x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s8,,)(tuple, 0); } @@ -39,7 +46,7 @@ svint8_t test_svget2_s8(svint8x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svget2_s16(svint16x2_t tuple) +svint16_t test_svget2_s16(svint16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s16,,)(tuple, 1); } @@ -54,7 +61,7 @@ svint16_t test_svget2_s16(svint16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svget2_s32(svint32x2_t tuple) +svint32_t test_svget2_s32(svint32x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s32,,)(tuple, 0); } @@ -69,7 +76,7 @@ svint32_t test_svget2_s32(svint32x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[TUPLE:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svget2_s64(svint64x2_t tuple) +svint64_t test_svget2_s64(svint64x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s64,,)(tuple, 1); } @@ -84,7 +91,7 @@ svint64_t test_svget2_s64(svint64x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svget2_u8(svuint8x2_t tuple) +svuint8_t test_svget2_u8(svuint8x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u8,,)(tuple, 0); } @@ -99,7 +106,7 @@ svuint8_t test_svget2_u8(svuint8x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svget2_u16(svuint16x2_t tuple) +svuint16_t test_svget2_u16(svuint16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u16,,)(tuple, 1); } @@ -114,7 +121,7 @@ svuint16_t test_svget2_u16(svuint16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svget2_u32(svuint32x2_t tuple) +svuint32_t test_svget2_u32(svuint32x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u32,,)(tuple, 0); } @@ -129,7 +136,7 @@ svuint32_t test_svget2_u32(svuint32x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[TUPLE:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svget2_u64(svuint64x2_t tuple) +svuint64_t test_svget2_u64(svuint64x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u64,,)(tuple, 1); } @@ -144,7 +151,7 @@ svuint64_t test_svget2_u64(svuint64x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svget2_f16(svfloat16x2_t tuple) +svfloat16_t test_svget2_f16(svfloat16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_f16,,)(tuple, 0); } @@ -159,7 +166,7 @@ svfloat16_t test_svget2_f16(svfloat16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret 
[[TMP0]] // -svfloat32_t test_svget2_f32(svfloat32x2_t tuple) +svfloat32_t test_svget2_f32(svfloat32x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_f32,,)(tuple, 1); } @@ -174,7 +181,7 @@ svfloat32_t test_svget2_f32(svfloat32x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svget2_f64(svfloat64x2_t tuple) +svfloat64_t test_svget2_f64(svfloat64x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_f64,,)(tuple, 0); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c index 47ce6bd19244..7a991bc7431d 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget3_bf16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 0) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) +svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 0); } @@ -40,7 +47,7 @@ svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) +svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 1); } @@ -55,7 +62,7 @@ svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget3_bf16_2(svbfloat16x3_t tuple) +svbfloat16_t test_svget3_bf16_2(svbfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c index 54847152dee7..de7c3c303ffc 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c @@ -5,6 +5,8 @@ // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS 
-triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget3_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[TUPLE:%.*]], i64 0) @@ -24,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svget3_s8(svint8x3_t tuple) +svint8_t test_svget3_s8(svint8x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s8,,)(tuple, 0); } @@ -39,7 +47,7 @@ svint8_t test_svget3_s8(svint8x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svget3_s16(svint16x3_t tuple) +svint16_t test_svget3_s16(svint16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s16,,)(tuple, 2); } @@ -54,7 +62,7 @@ svint16_t test_svget3_s16(svint16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svget3_s32(svint32x3_t tuple) +svint32_t test_svget3_s32(svint32x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s32,,)(tuple, 1); } @@ -69,7 +77,7 @@ svint32_t test_svget3_s32(svint32x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svget3_s64(svint64x3_t tuple) +svint64_t test_svget3_s64(svint64x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s64,,)(tuple, 0); } @@ -84,7 +92,7 @@ svint64_t test_svget3_s64(svint64x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[TUPLE:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svget3_u8(svuint8x3_t tuple) +svuint8_t test_svget3_u8(svuint8x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u8,,)(tuple, 2); } @@ -99,7 +107,7 @@ svuint8_t test_svget3_u8(svuint8x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svget3_u16(svuint16x3_t tuple) +svuint16_t test_svget3_u16(svuint16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u16,,)(tuple, 1); } @@ -114,7 +122,7 @@ svuint16_t test_svget3_u16(svuint16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svget3_u32(svuint32x3_t tuple) +svuint32_t test_svget3_u32(svuint32x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u32,,)(tuple, 0); } @@ -129,7 +137,7 @@ svuint32_t test_svget3_u32(svuint32x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv6i64( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svget3_u64(svuint64x3_t tuple) +svuint64_t test_svget3_u64(svuint64x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u64,,)(tuple, 2); } @@ -144,7 +152,7 @@ svuint64_t test_svget3_u64(svuint64x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svget3_f16(svfloat16x3_t tuple) +svfloat16_t test_svget3_f16(svfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_f16,,)(tuple, 1); } @@ -159,7 +167,7 @@ svfloat16_t test_svget3_f16(svfloat16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svget3_f32(svfloat32x3_t tuple) +svfloat32_t test_svget3_f32(svfloat32x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_f32,,)(tuple, 0); } @@ -174,7 +182,7 @@ svfloat32_t test_svget3_f32(svfloat32x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svget3_f64(svfloat64x3_t tuple) +svfloat64_t test_svget3_f64(svfloat64x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_f64,,)(tuple, 2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c index 454b3bf38bd3..3a5e282bfdfa 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget4_bf16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 0) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 0); } @@ -40,7 +47,7 @@ svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t 
tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 1); } @@ -55,7 +62,7 @@ svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 2); } @@ -70,7 +77,7 @@ svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_3(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_3(svbfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 3); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c index 13f8c2a2906e..9b4f9e5332a5 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +15,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // NOTE: For these tests clang converts the struct parameter into // several parameters, one for each member of the original struct. 
// CHECK-LABEL: @test_svget4_s8( @@ -26,7 +33,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svget4_s8(svint8x4_t tuple) +svint8_t test_svget4_s8(svint8x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s8,,)(tuple, 0); } @@ -41,7 +48,7 @@ svint8_t test_svget4_s8(svint8x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svget4_s16(svint16x4_t tuple) +svint16_t test_svget4_s16(svint16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s16,,)(tuple, 2); } @@ -56,7 +63,7 @@ svint16_t test_svget4_s16(svint16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svget4_s32(svint32x4_t tuple) +svint32_t test_svget4_s32(svint32x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s32,,)(tuple, 2); } @@ -71,7 +78,7 @@ svint32_t test_svget4_s32(svint32x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[TUPLE:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svget4_s64(svint64x4_t tuple) +svint64_t test_svget4_s64(svint64x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s64,,)(tuple, 3); } @@ -86,7 +93,7 @@ svint64_t test_svget4_s64(svint64x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[TUPLE:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svget4_u8(svuint8x4_t tuple) +svuint8_t test_svget4_u8(svuint8x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u8,,)(tuple, 2); } @@ -101,7 +108,7 @@ svuint8_t test_svget4_u8(svuint8x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[TUPLE:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svget4_u16(svuint16x4_t tuple) +svuint16_t test_svget4_u16(svuint16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u16,,)(tuple, 3); } @@ -116,7 +123,7 @@ svuint16_t test_svget4_u16(svuint16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svget4_u32(svuint32x4_t tuple) +svuint32_t test_svget4_u32(svuint32x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u32,,)(tuple, 0); } @@ -131,7 +138,7 @@ svuint32_t test_svget4_u32(svuint32x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[TUPLE:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svget4_u64(svuint64x4_t tuple) +svuint64_t test_svget4_u64(svuint64x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u64,,)(tuple, 3); } @@ -146,7 +153,7 @@ svuint64_t test_svget4_u64(svuint64x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svget4_f16(svfloat16x4_t tuple) +svfloat16_t test_svget4_f16(svfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_f16,,)(tuple, 2); } @@ -161,7 +168,7 @@ svfloat16_t test_svget4_f16(svfloat16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svget4_f32(svfloat32x4_t tuple) +svfloat32_t test_svget4_f32(svfloat32x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_f32,,)(tuple, 0); } @@ -176,7 +183,7 @@ 
svfloat32_t test_svget4_f32(svfloat32x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svget4_f64(svfloat64x4_t tuple) +svfloat64_t test_svget4_f64(svfloat64x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_f64,,)(tuple, 2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c index f6217977d26a..a92e5aedb59f 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svset2_bf16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 0) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x) +svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 0, x); } @@ -40,7 +47,7 @@ svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x2_t test_svset2_bf16_1(svbfloat16x2_t tuple, svbfloat16_t x) +svbfloat16x2_t test_svset2_bf16_1(svbfloat16x2_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 1, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c index 9ae301154920..b2bf4ad08aa9 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svset2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8x2_t test_svset2_s8(svint8x2_t tuple, svint8_t x) +svint8x2_t test_svset2_s8(svint8x2_t tuple, svint8_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s8,,)(tuple, 1, x); } @@ -40,7 +47,7 @@ svint8x2_t test_svset2_s8(svint8x2_t tuple, svint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16x2_t test_svset2_s16(svint16x2_t tuple, svint16_t x) +svint16x2_t test_svset2_s16(svint16x2_t tuple, svint16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s16,,)(tuple, 0, x); } @@ -55,7 +62,7 @@ svint16x2_t test_svset2_s16(svint16x2_t tuple, svint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32x2_t test_svset2_s32(svint32x2_t tuple, svint32_t x) +svint32x2_t test_svset2_s32(svint32x2_t tuple, svint32_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s32,,)(tuple, 1, x); } @@ -70,7 +77,7 @@ svint32x2_t test_svset2_s32(svint32x2_t tuple, svint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64x2_t test_svset2_s64(svint64x2_t tuple, svint64_t x) +svint64x2_t test_svset2_s64(svint64x2_t tuple, svint64_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s64,,)(tuple, 0, x); } @@ -85,7 +92,7 @@ svint64x2_t test_svset2_s64(svint64x2_t tuple, svint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8x2_t test_svset2_u8(svuint8x2_t tuple, svuint8_t x) +svuint8x2_t test_svset2_u8(svuint8x2_t tuple, svuint8_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u8,,)(tuple, 1, x); } @@ -100,7 +107,7 @@ svuint8x2_t test_svset2_u8(svuint8x2_t tuple, svuint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16x2_t test_svset2_u16(svuint16x2_t tuple, svuint16_t x) +svuint16x2_t test_svset2_u16(svuint16x2_t tuple, svuint16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u16,,)(tuple, 0, x); } @@ -115,7 +122,7 @@ svuint16x2_t test_svset2_u16(svuint16x2_t tuple, svuint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32x2_t test_svset2_u32(svuint32x2_t tuple, svuint32_t x) +svuint32x2_t test_svset2_u32(svuint32x2_t tuple, svuint32_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u32,,)(tuple, 1, x); } @@ -130,7 +137,7 @@ svuint32x2_t test_svset2_u32(svuint32x2_t tuple, 
svuint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64x2_t test_svset2_u64(svuint64x2_t tuple, svuint64_t x) +svuint64x2_t test_svset2_u64(svuint64x2_t tuple, svuint64_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u64,,)(tuple, 0, x); } @@ -145,7 +152,7 @@ svuint64x2_t test_svset2_u64(svuint64x2_t tuple, svuint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16x2_t test_svset2_f16(svfloat16x2_t tuple, svfloat16_t x) +svfloat16x2_t test_svset2_f16(svfloat16x2_t tuple, svfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_f16,,)(tuple, 1, x); } @@ -160,7 +167,7 @@ svfloat16x2_t test_svset2_f16(svfloat16x2_t tuple, svfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32x2_t test_svset2_f32(svfloat32x2_t tuple, svfloat32_t x) +svfloat32x2_t test_svset2_f32(svfloat32x2_t tuple, svfloat32_t x) ATTR { return SVE_ACLE_FUNC(svset2,_f32,,)(tuple, 0, x); } @@ -175,7 +182,7 @@ svfloat32x2_t test_svset2_f32(svfloat32x2_t tuple, svfloat32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TUPLE:%.*]], [[X:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64x2_t test_svset2_f64(svfloat64x2_t tuple, svfloat64_t x) +svfloat64x2_t test_svset2_f64(svfloat64x2_t tuple, svfloat64_t x) ATTR { return SVE_ACLE_FUNC(svset2,_f64,,)(tuple, 1, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c index aeef3ab59ee0..3bc8698ef89b 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // CHECK-LABEL: @test_svset3_bf16_0( // CHECK-NEXT: entry: @@ -26,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, svbfloat16_t x) +svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 0, x); } @@ -41,7 +47,7 @@ svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, 
svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x3_t test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) +svbfloat16x3_t test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 1, x); } @@ -56,7 +62,7 @@ svbfloat16x3_t test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x3_t test_svset3_bf16_2(svbfloat16x3_t tuple, svbfloat16_t x) +svbfloat16x3_t test_svset3_bf16_2(svbfloat16x3_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 2, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c index 1b9191cc8a33..9d10e6afca93 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // NOTE: For these tests clang converts the struct parameter into // several parameters, one for each member of the original struct. 
@@ -28,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8x3_t test_svset3_s8(svint8x3_t tuple, svint8_t x) +svint8x3_t test_svset3_s8(svint8x3_t tuple, svint8_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s8,,)(tuple, 1, x); } @@ -43,7 +49,7 @@ svint8x3_t test_svset3_s8(svint8x3_t tuple, svint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16x3_t test_svset3_s16(svint16x3_t tuple, svint16_t x) +svint16x3_t test_svset3_s16(svint16x3_t tuple, svint16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s16,,)(tuple, 2, x); } @@ -58,7 +64,7 @@ svint16x3_t test_svset3_s16(svint16x3_t tuple, svint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32x3_t test_svset3_s32(svint32x3_t tuple, svint32_t x) +svint32x3_t test_svset3_s32(svint32x3_t tuple, svint32_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s32,,)(tuple, 0, x); } @@ -73,7 +79,7 @@ svint32x3_t test_svset3_s32(svint32x3_t tuple, svint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64x3_t test_svset3_s64(svint64x3_t tuple, svint64_t x) +svint64x3_t test_svset3_s64(svint64x3_t tuple, svint64_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s64,,)(tuple, 1, x); } @@ -88,7 +94,7 @@ svint64x3_t test_svset3_s64(svint64x3_t tuple, svint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8x3_t test_svset3_u8(svuint8x3_t tuple, svuint8_t x) +svuint8x3_t test_svset3_u8(svuint8x3_t tuple, svuint8_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u8,,)(tuple, 2, x); } @@ -103,7 +109,7 @@ svuint8x3_t test_svset3_u8(svuint8x3_t tuple, svuint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16x3_t test_svset3_u16(svuint16x3_t tuple, svuint16_t x) +svuint16x3_t test_svset3_u16(svuint16x3_t tuple, svuint16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u16,,)(tuple, 0, x); } @@ -118,7 +124,7 @@ svuint16x3_t test_svset3_u16(svuint16x3_t tuple, svuint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32x3_t test_svset3_u32(svuint32x3_t tuple, svuint32_t x) +svuint32x3_t test_svset3_u32(svuint32x3_t tuple, svuint32_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u32,,)(tuple, 1, x); } @@ -133,7 +139,7 @@ svuint32x3_t test_svset3_u32(svuint32x3_t tuple, svuint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64x3_t test_svset3_u64(svuint64x3_t tuple, svuint64_t x) +svuint64x3_t test_svset3_u64(svuint64x3_t tuple, svuint64_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u64,,)(tuple, 2, x); } @@ -148,7 +154,7 @@ svuint64x3_t test_svset3_u64(svuint64x3_t tuple, svuint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16x3_t test_svset3_f16(svfloat16x3_t tuple, svfloat16_t x) +svfloat16x3_t 
test_svset3_f16(svfloat16x3_t tuple, svfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_f16,,)(tuple, 0, x); } @@ -163,7 +169,7 @@ svfloat16x3_t test_svset3_f16(svfloat16x3_t tuple, svfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32x3_t test_svset3_f32(svfloat32x3_t tuple, svfloat32_t x) +svfloat32x3_t test_svset3_f32(svfloat32x3_t tuple, svfloat32_t x) ATTR { return SVE_ACLE_FUNC(svset3,_f32,,)(tuple, 1, x); } @@ -178,7 +184,7 @@ svfloat32x3_t test_svset3_f32(svfloat32x3_t tuple, svfloat32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64x3_t test_svset3_f64(svfloat64x3_t tuple, svfloat64_t x) +svfloat64x3_t test_svset3_f64(svfloat64x3_t tuple, svfloat64_t x) ATTR { return SVE_ACLE_FUNC(svset3,_f64,,)(tuple, 2, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c index 013f591e8d72..31ca805dafc7 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // CHECK-LABEL: @test_svset4_bf16_0( // CHECK-NEXT: entry: @@ -26,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 0, x); } @@ -41,7 +47,7 @@ svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 1, x); } @@ -56,7 +62,7 @@ svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t 
tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 2, x); } @@ -71,7 +77,7 @@ svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x4_t test_svset4_bf16_3(svbfloat16x4_t tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_3(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 3, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c index e4ece8c2a65f..ce35bfb83c88 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // CHECK-LABEL: @test_svset4_s8( // CHECK-NEXT: entry: @@ -26,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8x4_t test_svset4_s8(svint8x4_t tuple, svint8_t x) +svint8x4_t test_svset4_s8(svint8x4_t tuple, svint8_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s8,,)(tuple, 1, x); } @@ -41,7 +47,7 @@ svint8x4_t test_svset4_s8(svint8x4_t tuple, svint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16x4_t test_svset4_s16(svint16x4_t tuple, svint16_t x) +svint16x4_t test_svset4_s16(svint16x4_t tuple, svint16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s16,,)(tuple, 3, x); } @@ -56,7 +62,7 @@ svint16x4_t test_svset4_s16(svint16x4_t tuple, svint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32x4_t test_svset4_s32(svint32x4_t tuple, svint32_t x) +svint32x4_t test_svset4_s32(svint32x4_t tuple, svint32_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s32,,)(tuple, 1, x); } @@ -71,7 +77,7 @@ svint32x4_t test_svset4_s32(svint32x4_t tuple, svint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64x4_t test_svset4_s64(svint64x4_t tuple, svint64_t x) +svint64x4_t test_svset4_s64(svint64x4_t tuple, svint64_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s64,,)(tuple, 1, x); } @@ -86,7 +92,7 @@ 
svint64x4_t test_svset4_s64(svint64x4_t tuple, svint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 48) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8x4_t test_svset4_u8(svuint8x4_t tuple, svuint8_t x) +svuint8x4_t test_svset4_u8(svuint8x4_t tuple, svuint8_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u8,,)(tuple, 3, x); } @@ -101,7 +107,7 @@ svuint8x4_t test_svset4_u8(svuint8x4_t tuple, svuint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16x4_t test_svset4_u16(svuint16x4_t tuple, svuint16_t x) +svuint16x4_t test_svset4_u16(svuint16x4_t tuple, svuint16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u16,,)(tuple, 1, x); } @@ -116,7 +122,7 @@ svuint16x4_t test_svset4_u16(svuint16x4_t tuple, svuint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32x4_t test_svset4_u32(svuint32x4_t tuple, svuint32_t x) +svuint32x4_t test_svset4_u32(svuint32x4_t tuple, svuint32_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u32,,)(tuple, 1, x); } @@ -131,7 +137,7 @@ svuint32x4_t test_svset4_u32(svuint32x4_t tuple, svuint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64x4_t test_svset4_u64(svuint64x4_t tuple, svuint64_t x) +svuint64x4_t test_svset4_u64(svuint64x4_t tuple, svuint64_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u64,,)(tuple, 3, x); } @@ -146,7 +152,7 @@ svuint64x4_t test_svset4_u64(svuint64x4_t tuple, svuint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16x4_t test_svset4_f16(svfloat16x4_t tuple, svfloat16_t x) +svfloat16x4_t test_svset4_f16(svfloat16x4_t tuple, svfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_f16,,)(tuple, 1, x); } @@ -161,7 +167,7 @@ svfloat16x4_t test_svset4_f16(svfloat16x4_t tuple, svfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32x4_t test_svset4_f32(svfloat32x4_t tuple, svfloat32_t x) +svfloat32x4_t test_svset4_f32(svfloat32x4_t tuple, svfloat32_t x) ATTR { return SVE_ACLE_FUNC(svset4,_f32,,)(tuple, 1, x); } @@ -176,7 +182,7 @@ svfloat32x4_t test_svset4_f32(svfloat32x4_t tuple, svfloat32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TUPLE:%.*]], [[X:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64x4_t test_svset4_f64(svfloat64x4_t tuple, svfloat64_t x) +svfloat64x4_t test_svset4_f64(svfloat64x4_t tuple, svfloat64_t x) ATTR { return SVE_ACLE_FUNC(svset4,_f64,,)(tuple, 3, x); } -- Gitee From e800212663084e43e508e1a38cbb08c7d6ce8879 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 19 Jan 2024 13:48:44 +0000 Subject: [PATCH 57/77] [AArch64][SME] Remove combination of private-ZA and preserves_za. (#78563) The new Clang attributes no longer support the combination of having a private-ZA function that preserves ZA. The use of __arm_preserves("za") means that ZA is shared and preserved. 
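For illustration, a minimal sketch of the resulting call-site behaviour (ACLE attribute spellings as above; this snippet is not part of the diff below):

  void shared_preserved_callee(void) __arm_preserves("za"); // shared ZA, left unchanged
  void private_za_callee(void);                             // private ZA

  void caller(void) __arm_inout("za") {
    shared_preserved_callee(); // shared: no lazy save is needed at all
    private_za_callee();       // private: a full lazy save is now always set up
  }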
There wasn't that much benefit to the special handling of this, because in practice it only meant that we'd avoid restoring the lazy-save afterwards, but it still needed setting up a lazy-save (with the possibility of using a 0-sized buffer). Perhaps a new attribute will be added in the future to support this case, at which point we can revert back some of the changes removed in this patch. But for now removing this code simplifies things. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> --- .../Target/AArch64/AArch64ISelLowering.cpp | 76 ++++++++----------- .../AArch64/sme-lazy-save-call-remarks.ll | 15 +--- .../CodeGen/AArch64/sme-lazy-save-call.ll | 46 ----------- 3 files changed, 34 insertions(+), 103 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d3901659d7d5..2bc78a3f1450 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7386,16 +7386,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); if (RequiresLazySave) { - SDValue NumZaSaveSlices; - if (!CalleeAttrs.preservesZA()) { - // Set up a lazy save mechanism by storing the runtime live slices - // (worst-case SVL) to the TPIDR2 stack object. - NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - } else if (CalleeAttrs.preservesZA()) { - NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64); - } - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, @@ -7403,6 +7393,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); + SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, MPI, MVT::i16); Chain = DAG.getNode( @@ -7415,14 +7407,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, CLI.CB) : OptimizationRemarkAnalysis("sme", "SMELazySaveZA", &MF.getFunction()); - DescribeCallsite(R) << " sets up a lazy save for ZA"; - if (CalleeAttrs.preservesZA()) - R << ", but callee preserves ZA, so we request 0 slices to be saved"; - else - R << ", and we request that all slices be saved"; - R << ore::setExtraArgs() - << ore::NV("CalleePreservesZA", CalleeAttrs.preservesZA()); - return R; + return DescribeCallsite(R) << " sets up a lazy save for ZA"; }); } @@ -7858,34 +7843,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (RequiresLazySave) { - if (!CalleeAttrs.preservesZA()) { - // Unconditionally resume ZA. - Result = DAG.getNode( - AArch64ISD::SMSTART, DL, MVT::Other, Result, - DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); - - // Conditionally restore the lazy save using a pseudo node. 
- unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. - SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, - DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, RegMask, Result.getValue(1)}); - } + // Unconditionally resume ZA. + Result = DAG.getNode( + AArch64ISD::SMSTART, DL, MVT::Other, Result, + DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + + // Conditionally restore the lazy save using a pseudo node. + unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); + SDValue RegMask = DAG.getRegisterMask( + TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + SDValue RestoreRoutine = DAG.getTargetExternalSymbol( + "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); + SDValue TPIDR2_EL0 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, + DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + + // Copy the address of the TPIDR2 block into X0 before 'calling' the + // RESTORE_ZA pseudo. + SDValue Glue; + SDValue TPIDR2Block = DAG.getFrameIndex( + FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); + Result = + DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Result.getValue(1)}); + // Finally reset the TPIDR2_EL0 register to 0. 
Result = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Result, diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll index 6762a768fd5b..d999311301f9 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll @@ -2,31 +2,24 @@ ; RUN: llc -mtriple=aarch64 -mattr=+sme --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s declare void @private_za_callee() -declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved" declare float @llvm.cos.f32(float) define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() ret void } define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() -; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() ret void } -define void @test_lazy_save_preserved_callee() nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_preserved_callee' to 'private_za_preserved_callee' sets up a lazy save for ZA, but callee preserves ZA, so we request 0 slices to be saved - call void @private_za_preserved_callee() - ret void -} - define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA %res = call float @llvm.cos.f32(float %a) ret float %res } diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 5db2a1914ed1..7eb7eb21ca59 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -2,7 +2,6 @@ ; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s declare void @private_za_callee() -declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved" declare float @llvm.cos.f32(float) ; Test lazy-save mechanism for a single callee. @@ -170,48 +169,3 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z call void @private_za_callee() ret void } - - -; Test lazy-save mechanism for an aarch64_pstate_za_shared caller -; calling a callee with aarch64_pstate_za_preserved. -define void @za_shared_caller_za_preserved_callee() nounwind "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: za_shared_caller_za_preserved_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: sub x9, x29, #80
-; CHECK-NEXT: stp x8, xzr, [x29, #-80]
-; CHECK-NEXT: msr TPIDR2_EL0, x9
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
-; CHECK-NEXT: tbz w19, #0, .LBB4_2
-; CHECK-NEXT: // %bb.1:
-; CHECK-NEXT: smstop sm
-; CHECK-NEXT: .LBB4_2:
-; CHECK-NEXT: bl private_za_preserved_callee
-; CHECK-NEXT: tbz w19, #0, .LBB4_4
-; CHECK-NEXT: // %bb.3:
-; CHECK-NEXT: smstart sm
-; CHECK-NEXT: .LBB4_4:
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: sub sp, x29, #64
-; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEXT: ret
-  call void @private_za_preserved_callee()
-  ret void
-}
--
Gitee

From 46cc419a70aa0436d5cd5eba4ddfd8226799b39c Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Fri, 19 Jan 2024 16:02:24 +0000
Subject: [PATCH 58/77] [Clang] Refactor diagnostics for SME builtins. (#78258)

The arm_sme.td file was still using `IsSharedZA` and `IsPreservesZA`, which
should be changed to match the newly added state attributes. This patch adds
`IsInZA`, `IsOutZA` and `IsInOutZA` as the state for the Clang builtins and
fixes up the code in SemaChecking and SveEmitter to match.

Note that the code is written in such a way that it can be easily extended
with ZT0 state (to follow in a future patch).
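For illustration, a hedged sketch of the switch cases the updated SveEmitter
might now generate under GET_SME_BUILTIN_GET_STATE (the three builtins are
picked to match the arm_sme.td flags below; exact mangled names assumed):

  #ifdef GET_SME_BUILTIN_GET_STATE
  case SME::BI__builtin_sme_svld1_hor_za8: // IsInOutZA: merges a load into ZA
    return ArmInOutZA;
  case SME::BI__builtin_sme_svst1_hor_za8: // IsInZA: only reads ZA
    return ArmInZA;
  case SME::BI__builtin_sme_svzero_za:     // IsOutZA: only writes ZA
    return ArmOutZA;
  #endif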
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/arm_sme.td | 86 +++++++++---------- clang/include/clang/Basic/arm_sve_sme_incl.td | 9 +- clang/lib/Sema/SemaChecking.cpp | 35 ++++---- .../Sema/aarch64-incompat-sm-builtin-calls.c | 6 ++ clang/utils/TableGen/SveEmitter.cpp | 22 ++--- 5 files changed, 84 insertions(+), 74 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index b5655afdf419..27dde7e84e96 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -21,19 +21,19 @@ include "arm_sve_sme_incl.td" multiclass ZALoad ch> { let TargetGuard = "sme" in { def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimPQ", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimPQl", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimPQ", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_vert", ch>; def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimPQl", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_vert", ch>; } } @@ -45,11 +45,11 @@ defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0 defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr">; def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr", []>; //////////////////////////////////////////////////////////////////////////////// @@ -58,19 +58,19 @@ def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", multiclass ZAStore ch> { let TargetGuard = "sme" in { def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimP%", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimP%l", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimP%", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_vert", ch>; def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimP%l", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_vert", ch>; } } @@ -82,11 +82,11 @@ defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", - [IsOverloadNone, 
IsStreamingCompatible, IsSharedZA, IsPreservesZA], + [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str">; def SVSTR_ZA : MInst<"svstr_za", "vm%", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], + [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str", []>; //////////////////////////////////////////////////////////////////////////////// @@ -96,11 +96,11 @@ multiclass ZARead ch> let TargetGuard = "sme" in { def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPim", t, MergeOp1, i_prefix # "_horiz", - [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; + [IsReadZA, IsStreaming, IsInZA], ch>; def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPim", t, MergeOp1, i_prefix # "_vert", - [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; + [IsReadZA, IsStreaming, IsInZA], ch>; } } @@ -117,11 +117,11 @@ multiclass ZAWrite ch let TargetGuard = "sme" in { def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimPd", t, MergeOp1, i_prefix # "_horiz", - [IsWriteZA, IsStreaming, IsSharedZA], ch>; + [IsWriteZA, IsStreaming, IsInOutZA], ch>; def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimPd", t, MergeOp1, i_prefix # "_vert", - [IsWriteZA, IsStreaming, IsSharedZA], ch>; + [IsWriteZA, IsStreaming, IsInOutZA], ch>; } } @@ -136,10 +136,10 @@ defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_writeq", let TargetGuard = "sme" in { def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], [ImmCheck<0, ImmCheck0_255>]>; def SVZERO_ZA : SInst<"svzero_za", "v", "", MergeNone, "aarch64_sme_zero", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA]>; + [IsOverloadNone, IsStreamingCompatible, IsOutZA]>; } //////////////////////////////////////////////////////////////////////////////// @@ -149,7 +149,7 @@ multiclass ZACount { let TargetGuard = "sme" in { def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone, "aarch64_sme_" # n_suffix, - [IsOverloadNone, IsStreamingCompatible, IsPreservesZA]>; + [IsOverloadNone, IsStreamingCompatible]>; } } @@ -164,13 +164,13 @@ defm SVCNTSD : ZACount<"cntsd">; multiclass ZAAdd { let TargetGuard = "sme" in { def NAME # _ZA32: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPd", "iUi", MergeOp1, - "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], + "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-i16i64" in { def NAME # _ZA64: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPd", "lUl", MergeOp1, - "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], + "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -186,7 +186,7 @@ multiclass ZAIntOuterProd { def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "c", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } @@ -194,7 +194,7 @@ multiclass ZAIntOuterProd { def NAME # _ZA64_H: SInst<"sv" # n_suffix2 # "_za64[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "s", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -213,7 +213,7 @@ multiclass ZAIntOuterProdMixedSigns { "viPPd" # 
!cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "c", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } @@ -222,7 +222,7 @@ multiclass ZAIntOuterProdMixedSigns { "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "s", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -239,24 +239,24 @@ multiclass ZAFPOuterProd { let TargetGuard = "sme" in { def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "h", MergeOp1, "aarch64_sme_" # n_suffix # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; def NAME # _ZA32_H: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "b", MergeOp1, "aarch64_sme_" # n_suffix # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; def NAME # _ZA32_S: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "f", MergeOp1, "aarch64_sme_" # n_suffix, - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-f64f64" in { def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d", MergeOp1, "aarch64_sme_" # n_suffix, - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -269,29 +269,29 @@ defm SVMOPS : ZAFPOuterProd<"mops">; multiclass ZAAddSub { let TargetGuard = "sme2" in { - def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", 
[IsStreaming, IsSharedZA], []>; + def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsInOutZA], []>; let TargetGuard = "sme-i16i64" in { - def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } let TargetGuard = "sme-f64f64" in { - def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } } } diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td 
b/clang/include/clang/Basic/arm_sve_sme_incl.td index d97d8ea0fac5..75c1f05e1f2f 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -218,10 +218,11 @@ def ReverseMergeAnyBinOp : FlagType<0x800000000>; // e.g. Implement SUBR_X def ReverseMergeAnyAccOp : FlagType<0x1000000000>; // e.g. Implement MSB_X using MLS_X. def IsStreaming : FlagType<0x2000000000>; def IsStreamingCompatible : FlagType<0x4000000000>; -def IsSharedZA : FlagType<0x8000000000>; -def IsPreservesZA : FlagType<0x10000000000>; -def IsReadZA : FlagType<0x20000000000>; -def IsWriteZA : FlagType<0x40000000000>; +def IsReadZA : FlagType<0x8000000000>; +def IsWriteZA : FlagType<0x10000000000>; +def IsInZA : FlagType<0x20000000000>; +def IsOutZA : FlagType<0x40000000000>; +def IsInOutZA : FlagType<0x80000000000>; // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h class ImmCheckType { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index c21395515f7c..96356f1bdca2 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2895,6 +2895,15 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context, enum ArmStreamingType { ArmNonStreaming, ArmStreaming, ArmStreamingCompatible }; +enum ArmSMEState : unsigned { + ArmNoState = 0, + + ArmInZA = 0b01, + ArmOutZA = 0b10, + ArmInOutZA = 0b11, + ArmZAMask = 0b11, +}; + bool Sema::ParseSVEImmChecks( CallExpr *TheCall, SmallVector, 3> &ImmChecks) { // Perform all the immediate checks for this builtin call. @@ -3052,26 +3061,20 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, } } -static bool hasSMEZAState(const FunctionDecl *FD) { - if (auto *Attr = FD->getAttr()) - if (Attr->isNewZA()) - return true; - if (const auto *T = FD->getType()->getAs()) { - FunctionType::ArmStateValue State = - FunctionType::getArmZAState(T->getAArch64SMEAttributes()); - if (State != FunctionType::ARM_None) - return true; - } - return false; +static bool hasArmZAState(const FunctionDecl *FD) { + const auto *T = FD->getType()->getAs(); + return (T && FunctionType::getArmZAState(T->getAArch64SMEAttributes()) != + FunctionType::ARM_None) || + (FD->hasAttr() && FD->getAttr()->isNewZA()); } -static bool hasSMEZAState(unsigned BuiltinID) { +static ArmSMEState getSMEState(unsigned BuiltinID) { switch (BuiltinID) { default: - return false; -#define GET_SME_BUILTIN_HAS_ZA_STATE + return ArmNoState; +#define GET_SME_BUILTIN_GET_STATE #include "clang/Basic/arm_sme_builtins_za_state.inc" -#undef GET_SME_BUILTIN_HAS_ZA_STATE +#undef GET_SME_BUILTIN_GET_STATE } } @@ -3088,7 +3091,7 @@ bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { if (BuiltinType) checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType); - if (hasSMEZAState(BuiltinID) && !hasSMEZAState(FD)) + if ((getSMEState(BuiltinID) & ArmZAMask) && !hasArmZAState(FD)) Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_za_builtin_no_za_state) << TheCall->getSourceRange(); diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 476da8534ce7..9e4b3920e543 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -102,3 +102,9 @@ svint8_t missing_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streami // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} return 
svread_hor_za8_s8_m(zd, pg, 0, slice_base); } + +__arm_new("za") +svint8_t new_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { + // expected-no-warning + return svread_hor_za8_s8_m(zd, pg, 0, slice_base); +} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 3828ad27b117..ab87d91625b9 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1642,21 +1642,21 @@ void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { for (auto *R : RV) createIntrinsic(R, Defs); - std::map> DefsZAState; - - uint64_t IsSharedZAFlag = getEnumValueForFlag("IsSharedZA"); + std::map> IntrinsicsPerState; for (auto &Def : Defs) { - bool HasZAState = Def->isFlagSet(IsSharedZAFlag); - DefsZAState[HasZAState].insert(Def->getMangledName()); + if (Def->isFlagSet(getEnumValueForFlag("IsInZA"))) + IntrinsicsPerState["ArmInZA"].insert(Def->getMangledName()); + else if (Def->isFlagSet(getEnumValueForFlag("IsOutZA"))) + IntrinsicsPerState["ArmOutZA"].insert(Def->getMangledName()); + else if (Def->isFlagSet(getEnumValueForFlag("IsInOutZA"))) + IntrinsicsPerState["ArmInOutZA"].insert(Def->getMangledName()); } - OS << "#ifdef GET_SME_BUILTIN_HAS_ZA_STATE\n"; - - for (auto HasZA : {true, false}) { - auto Names = DefsZAState[HasZA]; - for (auto Name : Names) + OS << "#ifdef GET_SME_BUILTIN_GET_STATE\n"; + for (auto &KV : IntrinsicsPerState) { + for (StringRef Name : KV.second) OS << "case SME::BI__builtin_sme_" << Name << ":\n"; - OS << " return " << (HasZA ? "true" : "false") << ";\n"; + OS << " return " << KV.first << ";\n"; } OS << "#endif\n\n"; } -- Gitee From 471013ea908f72b5e8120daacd40680c1da42abf Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 19 Jan 2024 16:15:38 +0000 Subject: [PATCH 59/77] [AArch64] NFC: Simplify discombobulating 'requiresSMChange' interface (#78703) Having it return a `std::optional` is unnecessarily confusing. This patch changes it to a simple 'bool'. This patch also removes the 'BodyOverridesInterface' operand because there is only a single use for this which is easily rewritten. 
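For illustration, a hedged before/after sketch of a call site (`emitSMChange`
is a hypothetical helper; `MF` and `CB` stand for the surrounding
MachineFunction and call, as in the lowering code below):

  SMEAttrs Caller(MF.getFunction()), Callee(*CB);
  // Before (removed): the optional encoded both the "is a change needed?"
  // answer and the direction of the transition:
  //   if (std::optional<bool> Enter = Caller.requiresSMChange(Callee))
  //     emitSMChange(/*Enable=*/*Enter);
  // After: a plain bool, with the direction taken from the callee itself:
  if (Caller.requiresSMChange(Callee))
    emitSMChange(/*Enable=*/Callee.hasStreamingInterface());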
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 15 +++-- .../AArch64/AArch64TargetTransformInfo.cpp | 5 +- .../AArch64/Utils/AArch64SMEAttributes.cpp | 20 ++----- .../AArch64/Utils/AArch64SMEAttributes.h | 9 +-- .../Target/AArch64/SMEAttributesTest.cpp | 59 ++++--------------- 5 files changed, 28 insertions(+), 80 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2bc78a3f1450..875cedee1881 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7412,8 +7412,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } SDValue PStateSM; - std::optional RequiresSMChange = - CallerAttrs.requiresSMChange(CalleeAttrs); + bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs); if (RequiresSMChange) { if (CallerAttrs.hasStreamingInterfaceOrBody()) PStateSM = DAG.getConstant(1, DL, MVT::i64); @@ -7687,8 +7686,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue InGlue; if (RequiresSMChange) { - SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain, - InGlue, PStateSM, true); + SDValue NewChain = + changeStreamingMode(DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, + InGlue, PStateSM, true); Chain = NewChain.getValue(0); InGlue = NewChain.getValue(1); } @@ -7838,8 +7838,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresSMChange) { assert(PStateSM && "Expected a PStateSM to be set"); - Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InGlue, - PStateSM, false); + Result = changeStreamingMode(DAG, DL, !CalleeAttrs.hasStreamingInterface(), + Result, InGlue, PStateSM, false); } if (RequiresLazySave) { @@ -24932,8 +24932,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (auto *Base = dyn_cast(&Inst)) { auto CallerAttrs = SMEAttrs(*Inst.getFunction()); auto CalleeAttrs = SMEAttrs(*Base); - if (CallerAttrs.requiresSMChange(CalleeAttrs, - /*BodyOverridesInterface=*/false) || + if (CallerAttrs.requiresSMChange(CalleeAttrs) || CallerAttrs.requiresLazySave(CalleeAttrs)) return true; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 17d42889d63c..2aaf2194ccaf 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -227,8 +227,9 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return false; if (CallerAttrs.requiresLazySave(CalleeAttrs) || - CallerAttrs.requiresSMChange(CalleeAttrs, - /*BodyOverridesInterface=*/true)) { + (CallerAttrs.requiresSMChange(CalleeAttrs) && + (!CallerAttrs.hasStreamingInterfaceOrBody() || + !CalleeAttrs.hasStreamingBody()))) { if (hasPossibleIncompatibleOps(Callee)) return false; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 0082b4017986..05d70ab76ede 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -62,27 +62,17 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= ZA_Preserved; } -std::optional -SMEAttrs::requiresSMChange(const SMEAttrs &Callee, - bool BodyOverridesInterface) const { - // If the transition is not through a call (e.g. 
when considering inlining) - // and Callee has a streaming body, then we can ignore the interface of - // Callee. - if (BodyOverridesInterface && Callee.hasStreamingBody()) { - return hasStreamingInterfaceOrBody() ? std::nullopt - : std::optional(true); - } - +bool SMEAttrs::requiresSMChange(const SMEAttrs &Callee) const { if (Callee.hasStreamingCompatibleInterface()) - return std::nullopt; + return false; // Both non-streaming if (hasNonStreamingInterfaceAndBody() && Callee.hasNonStreamingInterface()) - return std::nullopt; + return false; // Both streaming if (hasStreamingInterfaceOrBody() && Callee.hasStreamingInterface()) - return std::nullopt; + return false; - return Callee.hasStreamingInterface(); + return true; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index e766b778b541..93ea8eedb811 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -65,14 +65,7 @@ public: /// \return true if a call from Caller -> Callee requires a change in /// streaming mode. - /// If \p BodyOverridesInterface is true and Callee has a streaming body, - /// then requiresSMChange considers a call to Callee as having a Streaming - /// interface. This can be useful when considering e.g. inlining, where we - /// explicitly want the body to overrule the interface (because after inlining - /// the interface is no longer relevant). - std::optional - requiresSMChange(const SMEAttrs &Callee, - bool BodyOverridesInterface = false) const; + bool requiresSMChange(const SMEAttrs &Callee) const; // Interfaces to query PSTATE.ZA bool hasNewZABody() const { return Bitmask & ZA_New; } diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp index 7780c71bbc00..5ac143b52a25 100644 --- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp +++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp @@ -102,86 +102,51 @@ TEST(SMEAttributes, Transitions) { ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal))); // Normal -> Normal + LocallyStreaming ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal | SA::SM_Body))); - ASSERT_EQ(*SA(SA::Normal) - .requiresSMChange(SA(SA::Normal | SA::SM_Body), - /*BodyOverridesInterface=*/true), - true); // Normal -> Streaming - ASSERT_EQ(*SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled)), true); + ASSERT_TRUE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled))); // Normal -> Streaming + LocallyStreaming - ASSERT_EQ(*SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)), - true); - ASSERT_EQ(*SA(SA::Normal) - .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body), - /*BodyOverridesInterface=*/true), - true); + ASSERT_TRUE( + SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body))); // Normal -> Streaming-compatible ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible))); // Normal -> Streaming-compatible + LocallyStreaming ASSERT_FALSE( SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body))); - ASSERT_EQ(*SA(SA::Normal) - .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body), - /*BodyOverridesInterface=*/true), - true); // Streaming -> Normal - ASSERT_EQ(*SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal)), false); + ASSERT_TRUE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal))); // Streaming -> Normal + LocallyStreaming - ASSERT_EQ(*SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal | SA::SM_Body)), - false); - 
ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
+  ASSERT_TRUE(
+      SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
   // Streaming -> Streaming
   ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled)));
   // Streaming -> Streaming + LocallyStreaming
   ASSERT_FALSE(
       SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)));
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
   // Streaming -> Streaming-compatible
   ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible)));
   // Streaming -> Streaming-compatible + LocallyStreaming
   ASSERT_FALSE(
       SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body)));
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
   // Streaming-compatible -> Normal
-  ASSERT_EQ(*SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal)), false);
-  ASSERT_EQ(
-      *SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal | SA::SM_Body)),
-      false);
-  ASSERT_EQ(*SA(SA::SM_Compatible)
-                .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                  /*BodyOverridesInterface=*/true),
-            true);
+  ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal)));
+  ASSERT_TRUE(
+      SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
   // Streaming-compatible -> Streaming
-  ASSERT_EQ(*SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled)), true);
+  ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled)));
   // Streaming-compatible -> Streaming + LocallyStreaming
-  ASSERT_EQ(
-      *SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)),
-      true);
-  ASSERT_EQ(*SA(SA::SM_Compatible)
-                .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body),
-                                  /*BodyOverridesInterface=*/true),
-            true);
+  ASSERT_TRUE(
+      SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)));
   // Streaming-compatible -> Streaming-compatible
   ASSERT_FALSE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Compatible)));
   // Streaming-compatible -> Streaming-compatible + LocallyStreaming
   ASSERT_FALSE(SA(SA::SM_Compatible)
                    .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body)));
-  ASSERT_EQ(*SA(SA::SM_Compatible)
-                .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body),
-                                  /*BodyOverridesInterface=*/true),
-            true);
 }
--
Gitee

From b8ce34f13af6dcd5ce773249a62d25598147b803 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Mon, 22 Jan 2024 16:30:43 +0000
Subject: [PATCH 60/77] [AArch64][SME2] Extend SMEABIPass to handle functions
 with new ZT0 state (#78848)

updateNewZAFunctions is extended to generate the following on entry to a
function with either the "aarch64_pstate_za_new" or "aarch64_new_zt0"
attribute:

- Private-ZA interface: commit any active lazy-saves & enable PSTATE.ZA.
- "aarch64_pstate_za_new": zero ZA.
- "aarch64_new_zt0": zero ZT0.

Additionally, PSTATE.ZA should be disabled before returning if the
function has a private-ZA interface.
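As a hedged source-level sketch of the new behaviour (ACLE attribute
spellings; the comments paraphrase the generated code rather than exact IR):

  __arm_new("za") void f(void) {
    // Entry: commit a pending lazy save if TPIDR2_EL0 is non-null, enable
    // PSTATE.ZA, then zero ZA; an __arm_new("zt0") function zeroes ZT0 instead.
  }
  // Exit: every return first disables PSTATE.ZA, since f has a private-ZA
  // interface.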
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- llvm/lib/Target/AArch64/SMEABIPass.cpp | 132 +++++++++++------- .../AArch64/Utils/AArch64SMEAttributes.cpp | 11 +- .../AArch64/Utils/AArch64SMEAttributes.h | 17 ++- 3 files changed, 94 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index c813d92ec85b..15e047e5e7de 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -41,7 +41,8 @@ struct SMEABI : public FunctionPass { bool runOnFunction(Function &F) override; private: - bool updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder); + bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder, + SMEAttrs FnAttrs); }; } // end anonymous namespace @@ -77,56 +78,87 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { Builder.getInt64(0)); } -/// This function generates code to commit a lazy save at the beginning of a -/// function marked with `aarch64_pstate_za_new`. If the value read from -/// TPIDR2_EL0 is not null on entry to the function then the lazy-saving scheme -/// is active and we should call __arm_tpidr2_save to commit the lazy save. -/// Additionally, PSTATE.ZA should be enabled at the beginning of the function -/// and disabled before returning. -bool SMEABI::updateNewZAFunctions(Module *M, Function *F, - IRBuilder<> &Builder) { +/// This function generates code at the beginning and end of a function marked +/// with either `aarch64_pstate_za_new` or `aarch64_new_zt0`. +/// At the beginning of the function, the following code is generated: +/// - Commit lazy-save if active [Private-ZA Interface*] +/// - Enable PSTATE.ZA [Private-ZA Interface] +/// - Zero ZA [Has New ZA State] +/// - Zero ZT0 [Has New ZT0 State] +/// +/// * A function with new ZT0 state will not change ZA, so committing the +/// lazy-save is not strictly necessary. However, the lazy-save mechanism +/// may be active on entry to the function, with PSTATE.ZA set to 1. If +/// the new ZT0 function calls a function that does not share ZT0, we will +/// need to conditionally SMSTOP ZA before the call, setting PSTATE.ZA to 0. +/// For this reason, it's easier to always commit the lazy-save at the +/// beginning of the function regardless of whether it has ZA state. +/// +/// At the end of the function, PSTATE.ZA is disabled if the function has a +/// Private-ZA Interface. A function is considered to have a Private-ZA +/// interface if it does not share ZA or ZT0. +/// +bool SMEABI::updateNewStateFunctions(Module *M, Function *F, + IRBuilder<> &Builder, SMEAttrs FnAttrs) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); - - // Create the new blocks for reading TPIDR2_EL0 & enabling ZA state. - auto *SaveBB = OrigBB->splitBasicBlock(OrigBB->begin(), "save.za", true); - auto *PreludeBB = BasicBlock::Create(Context, "prelude", F, SaveBB); - - // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0. - Builder.SetInsertPoint(PreludeBB); - Function *TPIDR2Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); - auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr, - {}, "tpidr2"); - auto *Cmp = - Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, Builder.getInt64(0), "cmp"); - Builder.CreateCondBr(Cmp, SaveBB, OrigBB); - - // Create a call __arm_tpidr2_save, which commits the lazy save. 
- Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder); - - // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); - Function *EnableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); - Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); - - // ZA state must be zeroed upon entry to a function with NewZA - Function *ZeroIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); - Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, - Builder.getInt32(0xff)); - - // Before returning, disable pstate.za - for (BasicBlock &BB : *F) { - Instruction *T = BB.getTerminator(); - if (!T || !isa(T)) - continue; - Builder.SetInsertPoint(T); - Function *DisableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); - Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); + + // Commit any active lazy-saves if this is a Private-ZA function. If the + // value read from TPIDR2_EL0 is not null on entry to the function then + // the lazy-saving scheme is active and we should call __arm_tpidr2_save + // to commit the lazy save. + if (FnAttrs.hasPrivateZAInterface()) { + // Create the new blocks for reading TPIDR2_EL0 & enabling ZA state. + auto *SaveBB = OrigBB->splitBasicBlock(OrigBB->begin(), "save.za", true); + auto *PreludeBB = BasicBlock::Create(Context, "prelude", F, SaveBB); + + // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0. + Builder.SetInsertPoint(PreludeBB); + Function *TPIDR2Intr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); + auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr, + {}, "tpidr2"); + auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, + Builder.getInt64(0), "cmp"); + Builder.CreateCondBr(Cmp, SaveBB, OrigBB); + + // Create a call __arm_tpidr2_save, which commits the lazy save. + Builder.SetInsertPoint(&SaveBB->back()); + emitTPIDR2Save(M, Builder); + + // Enable pstate.za at the start of the function. 
+ Builder.SetInsertPoint(&OrigBB->front()); + Function *EnableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + } + + if (FnAttrs.hasNewZABody()) { + Function *ZeroIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); + Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, + Builder.getInt32(0xff)); + } + + if (FnAttrs.isNewZT0()) { + Function *ClearZT0Intr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt); + Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr, + {Builder.getInt32(0)}); + } + + if (FnAttrs.hasPrivateZAInterface()) { + // Before returning, disable pstate.za + for (BasicBlock &BB : *F) { + Instruction *T = BB.getTerminator(); + if (!T || !isa(T)) + continue; + Builder.SetInsertPoint(T); + Function *DisableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); + Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); + } } F->addFnAttr("aarch64_expanded_pstate_za"); @@ -143,8 +175,8 @@ bool SMEABI::runOnFunction(Function &F) { bool Changed = false; SMEAttrs FnAttrs(F); - if (FnAttrs.hasNewZABody()) - Changed |= updateNewZAFunctions(M, &F, Builder); + if (FnAttrs.hasNewZABody() || FnAttrs.isNewZT0()) + Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs); return Changed; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 05d70ab76ede..7cb7b296a163 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -24,10 +24,8 @@ void SMEAttrs::set(unsigned M, bool Enable) { "ZA_New and ZA_Shared are mutually exclusive"); assert(!(hasNewZABody() && preservesZA()) && "ZA_New and ZA_Preserved are mutually exclusive"); - assert(!(hasNewZABody() && (Bitmask & ZA_NoLazySave)) && - "ZA_New and ZA_NoLazySave are mutually exclusive"); - assert(!(hasSharedZAInterface() && (Bitmask & ZA_NoLazySave)) && - "ZA_Shared and ZA_NoLazySave are mutually exclusive"); + assert(!(hasNewZABody() && (Bitmask & SME_ABI_Routine)) && + "ZA_New and SME_ABI_Routine are mutually exclusive"); } SMEAttrs::SMEAttrs(const CallBase &CB) { @@ -39,11 +37,10 @@ SMEAttrs::SMEAttrs(const CallBase &CB) { SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) { if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") - Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved | - SMEAttrs::ZA_NoLazySave); + Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); if (FuncName == "__arm_tpidr2_restore") Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared | - SMEAttrs::ZA_NoLazySave); + SMEAttrs::SME_ABI_Routine); } SMEAttrs::SMEAttrs(const AttributeList &Attrs) { diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 93ea8eedb811..b43014a5248d 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -29,14 +29,13 @@ public: // Enum with bitmasks for each individual SME feature. 
enum Mask { Normal = 0, - SM_Enabled = 1 << 0, // aarch64_pstate_sm_enabled - SM_Compatible = 1 << 1, // aarch64_pstate_sm_compatible - SM_Body = 1 << 2, // aarch64_pstate_sm_body - ZA_Shared = 1 << 3, // aarch64_pstate_sm_shared - ZA_New = 1 << 4, // aarch64_pstate_sm_new - ZA_Preserved = 1 << 5, // aarch64_pstate_sm_preserved - ZA_NoLazySave = 1 << 6, // Used for SME ABI routines to avoid lazy saves - All = ZA_Preserved - 1 + SM_Enabled = 1 << 0, // aarch64_pstate_sm_enabled + SM_Compatible = 1 << 1, // aarch64_pstate_sm_compatible + SM_Body = 1 << 2, // aarch64_pstate_sm_body + ZA_Shared = 1 << 3, // aarch64_pstate_sm_shared + ZA_New = 1 << 4, // aarch64_pstate_sm_new + ZA_Preserved = 1 << 5, // aarch64_pstate_sm_preserved + SME_ABI_Routine = 1 << 6, // Used for SME ABI routines to avoid lazy saves }; SMEAttrs(unsigned Mask = Normal) : Bitmask(0) { set(Mask); } @@ -77,7 +76,7 @@ public: } bool requiresLazySave(const SMEAttrs &Callee) const { return hasZAState() && Callee.hasPrivateZAInterface() && - !(Callee.Bitmask & ZA_NoLazySave); + !(Callee.Bitmask & SME_ABI_Routine); } }; -- Gitee From 7dc4badf21a07837ea7766555bff7f464608f97e Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Mon, 22 Jan 2024 17:12:16 +0000 Subject: [PATCH 61/77] [AArch64][SME] Take arm_sme.h out of draft (#78961) Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/lib/Headers/CMakeLists.txt | 6 +- .../aarch64-sme-intrinsics/acle_sme_add-i32.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_add-i64.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_cnt.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_ld1.c | 2 +- .../acle_sme_ld1_vnum.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_ldr.c | 2 +- .../acle_sme_mopa-za32.c | 2 +- .../acle_sme_mopa-za64.c | 2 +- .../acle_sme_mops-za32.c | 2 +- .../acle_sme_mops-za64.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_read.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_st1.c | 2 +- .../acle_sme_st1_vnum.c | 2 +- .../acle_sme_state_funs.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_str.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_write.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_zero.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_add.c | 649 ------------------ .../aarch64-sme2-intrinsics/acle_sme2_sub.c | 649 ------------------ .../acle_sme2_vector_add.c | 539 --------------- .../Sema/aarch64-incompat-sm-builtin-calls.c | 2 +- .../aarch64-sme-intrinsics/acle_sme_imm.cpp | 2 +- .../aarch64-sme-intrinsics/acle_sme_target.c | 2 +- clang/utils/TableGen/SveEmitter.cpp | 4 +- .../gn/secondary/clang/lib/Headers/BUILD.gn | 8 +- .../llvm-project-overlay/clang/BUILD.bazel | 6 +- 27 files changed, 32 insertions(+), 1869 deletions(-) delete mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c delete mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c delete mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 356009ae9157..0477cad066b6 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -362,8 +362,8 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD) clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h) # Generate arm_sve.h clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h) - # Generate arm_sme_draft_spec_subject_to_change.h - clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme_draft_spec_subject_to_change.h) + # 
Generate arm_sme.h + clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme.h) # Generate arm_bf16.h clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h) # Generate arm_mve.h @@ -384,7 +384,7 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD) list(APPEND aarch64_only_generated_files "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h" - "${CMAKE_CURRENT_BINARY_DIR}/arm_sme_draft_spec_subject_to_change.h" + "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h" "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h" ) endif() diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index 08301009df47..89aef63bc34b 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c index b8836bec6400..a5e04ff9071c 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c index b3b2499a3830..5e5f3108bccd 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svcntsb( // CHECK-CXX-LABEL: @_Z12test_svcntsbv( diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index e3c727941ccc..37812bba4969 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svld1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKv( diff --git
a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 34191bf799f3..29357055bb97 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svld1_hor_vnum_za8( // CHECK-CXX-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl( diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index 5f5b40a5ccf9..627f770d3ce4 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svldr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c index 08f945001d07..e6f8fbefa34f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c index 42f09516a74f..c55ff1cc5e99 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c index 5629c59d66d6..aa853e43dac8 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu
-target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c index c1abc3a30079..e8a54c1fbb54 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c index 6748f8602206..f11e1a87cf33 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index b418f21c5cf8..01681eab04b5 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svst1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPv( diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c index d346ec346e61..9c21c7b14a56 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svst1_hor_vnum_za8( // CHECK-CXX-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl( diff --git
a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c index c3e4967bfe9b..dc07efbb8160 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-LABEL: @test_in_streaming_mode( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index 23dcd9c12a8d..0133a3d9a1cb 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svstr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv( diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c index f4ac53291400..64e7c10e3022 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index 0c157af1cdc9..ba1cf9ae991e 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> // CHECK-C-LABEL: @test_svzero_mask_za( // CHECK-CXX-LABEL: @_Z19test_svzero_mask_zav( diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c deleted file mode 100644 index 4249e9c6933a..000000000000 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c +++ /dev/null @@ -1,649 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py - -// REQUIRES: aarch64-registered-target - -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 -#endif - -// -// Single-Multi -// - -// x2 -// CHECK-LABEL: @test_svadd_write_single2_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s32j11svint32x2_tu11__SVInt32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_single2_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u32j12svuint32x2_tu12__SVUint32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming 
__arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_single2_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s64j11svint64x2_tu11__SVInt64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_single2_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u64j12svuint64x2_tu12__SVUint64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm); -} - -// x4 - -// CHECK-LABEL: @test_svadd_write_single4_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s32j11svint32x4_tu11__SVInt32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_single4_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u32j12svuint32x4_tu12__SVUint32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_single4_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s64j11svint64x4_tu11__SVInt64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_single4_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u64j12svuint64x4_tu12__SVUint64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm); -} - -// -// Multi-Multi -// - -// x2 - -// CHECK-LABEL: @test_svadd_write_multi2_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s32j11svint32x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_multi2_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u32j12svuint32x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_multi2_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s64j11svint64x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_multi2_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u64j12svuint64x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], 
i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm); -} - -// x4 - -// CHECK-LABEL: @test_svadd_write_multi4_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s32j11svint32x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_multi4_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u32j12svuint32x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_multi4_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s64j11svint64x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svadd_write_multi4_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u64j12svuint64x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm); -} - -// -// Accumulate to ZA -// - -// x2 - -// CHECK-LABEL: @test_svadd_za32_vg1x2_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_f32j13svfloat32x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 
[[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za32_vg1x2_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_s32j11svint32x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base , zn); -} - -// CHECK-LABEL: @test_svadd_za32_vg1x2_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_u32j12svuint32x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za64_vg1x2_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_f64j13svfloat64x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za64_vg1x2_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_s64j11svint64x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za64_vg1x2_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_u64j12svuint64x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, zn); -} - -// x4 - -// CHECK-LABEL: @test_svadd_za32_vg1x4_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_f32j13svfloat32x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za32_vg1x4_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( 
[[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_s32j11svint32x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za32_vg1x4_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_u32j12svuint32x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za64_vg1x4_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_f64j13svfloat64x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za64_vg1x4_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_s64j11svint64x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svadd_za64_vg1x4_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_u64j12svuint64x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, zn); -} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c deleted file mode 100644 index 3f0a36db313b..000000000000 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c +++ /dev/null @@ -1,649 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py - -// REQUIRES: aarch64-registered-target - -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 -#endif - -// -// Single-Multi -// - -// x2 -// CHECK-LABEL: @test_svsub_write_single2_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s32j11svint32x2_tu11__SVInt32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_single2_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_single2_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_s64j11svint64x2_tu11__SVInt64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm); -} - -// 
CHECK-LABEL: @test_svsub_write_single2_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm); -} - -// x4 - -// CHECK-LABEL: @test_svsub_write_single4_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s32j11svint32x4_tu11__SVInt32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_single4_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u32j12svuint32x4_tu12__SVUint32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_single4_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_s64j11svint64x4_tu11__SVInt64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_single4_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u64j12svuint64x4_tu12__SVUint64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// 
CPP-CHECK-NEXT: ret void -// -void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm); -} - -// -// Multi-Multi -// - -// x2 - -// CHECK-LABEL: @test_svsub_write_multi2_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s32j11svint32x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_multi2_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u32j12svuint32x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_multi2_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = 
tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_s64j11svint64x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_multi2_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u64j12svuint64x2_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm); -} - -// x4 - -// CHECK-LABEL: @test_svsub_write_multi4_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// 
CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s32j11svint32x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_multi4_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u32j12svuint32x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 
[[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_multi4_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_s64j11svint64x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm); -} - -// CHECK-LABEL: @test_svsub_write_multi4_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u64j12svuint64x4_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm); -} - -// -// Accumulate to ZA -// - -// x2 - -// CHECK-LABEL: @test_svsub_za32_vg1x2_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_f32j13svfloat32x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za32_vg1x2_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_s32j11svint32x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_inout("za") { - 
SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x2)(slice_base , zn); -} - -// CHECK-LABEL: @test_svsub_za32_vg1x2_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_u32j12svuint32x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za64_vg1x2_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_f64j13svfloat64x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za64_vg1x2_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_s64j11svint64x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x2)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za64_vg1x2_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CHECK-NEXT: ret void -// -// 
CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_u64j12svuint64x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, zn); -} - -// x4 - -// CHECK-LABEL: @test_svsub_za32_vg1x4_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_f32j13svfloat32x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za32_vg1x4_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_s32j11svint32x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: 
@test_svsub_za32_vg1x4_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_u32j12svuint32x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za64_vg1x4_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_f64j13svfloat64x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za64_vg1x4_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: 
@_Z25test_svsub_za64_vg1x4_s64j11svint64x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x4)(slice_base, zn); -} - -// CHECK-LABEL: @test_svsub_za64_vg1x4_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_u64j12svuint64x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) -// CPP-CHECK-NEXT: ret void -// -void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, zn); -} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c deleted file mode 100644 index 85c4b9b09546..000000000000 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c +++ /dev/null @@ -1,539 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py - -// REQUIRES: aarch64-registered-target - -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: 
%clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include <arm_sme.h> - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 -#endif - -// -// Multi-Single Vector -// - -// x2 - -// CHECK-LABEL: @test_svadd_vector_single2_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s8_x2,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single2_u8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call
@llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u8_x2,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single2_s16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s16_x2,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single2_u16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail 
call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u16_x2,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single2_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s32_x2,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single2_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u32_x2,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single2_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s64_x2,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single2_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) -// CHECK-NEXT: ret [[TMP6]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) -// CPP-CHECK-NEXT: ret [[TMP6]] -// -svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u64_x2,,,)(zn, zm); -} - - -// x4 - -// CHECK-LABEL: @test_svadd_vector_single4_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) -// CPP-CHECK-NEXT: ret [[TMP12]] -// -svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s8_x4,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single4_u8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) -// CPP-CHECK-NEXT: ret [[TMP12]] -// -svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u8_x4,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single4_s16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail 
call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) -// CPP-CHECK-NEXT: ret [[TMP12]] -// -svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s16_x4,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single4_u16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) -// CPP-CHECK-NEXT: ret [[TMP12]] -// -svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u16_x4,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single4_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) -// CPP-CHECK-NEXT: ret [[TMP12]] -// -svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s32_x4,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single4_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) -// CPP-CHECK-NEXT: ret [[TMP12]] -// 
-svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u32_x4,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single4_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) -// CPP-CHECK-NEXT: ret [[TMP12]] -// -svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_s64_x4,,,)(zn, zm); -} - -// CHECK-LABEL: @test_svadd_vector_single4_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) -// CHECK-NEXT: ret [[TMP12]] -// -// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 -// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) -// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 -// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) -// CPP-CHECK-NEXT: ret [[TMP12]] -// -svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) __arm_streaming { - return SVE_ACLE_FUNC(svadd,_single_u64_x4,,,)(zn, zm); -} diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 9e4b3920e543..079cff5a5bba 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -5,7 +5,7 @@ // REQUIRES: aarch64-registered-target #include "arm_neon.h" -#include "arm_sme_draft_spec_subject_to_change.h" +#include "arm_sme.h" #include "arm_sve.h" int16x8_t incompat_neon_sm(int16x8_t splat) __arm_streaming { diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index 40254a5a0eaf..8f3f98394492 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -10,7 +10,7 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -#include +#include void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c 
b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
index f1e858f81960..6af115beba8e 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
@@ -3,7 +3,7 @@
 
 // Test that functions with the correct target attributes can use the correct SME intrinsics.
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 __attribute__((target("sme")))
 void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index ab87d91625b9..2b61f8b81587 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1480,7 +1480,7 @@ void SVEEmitter::createTypeFlags(raw_ostream &OS) {
 }
 
 void SVEEmitter::createSMEHeader(raw_ostream &OS) {
-  OS << "/*===---- arm_sme_draft_spec_subject_to_change.h - ARM SME intrinsics "
+  OS << "/*===---- arm_sme.h - ARM SME intrinsics "
        "------===\n"
        " *\n"
        " *\n"
@@ -1497,7 +1497,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
   OS << "#define __ARM_SME_H\n\n";
 
   OS << "#if !defined(__LITTLE_ENDIAN__)\n";
-  OS << "#error \"Big endian is currently not supported for arm_sme_draft_spec_subject_to_change.h\"\n";
+  OS << "#error \"Big endian is currently not supported for arm_sme.h\"\n";
   OS << "#endif\n";
 
   OS << "#include <arm_sve.h>\n\n";
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index bad86c2c984a..e465ae7e00f3 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -22,11 +22,11 @@ clang_tablegen("arm_sve") {
   output_name = "arm_sve.h"
 }
 
-# Generate arm_sme_draft_spec_subject_to_change.h
-clang_tablegen("arm_sme_draft_spec_subject_to_change") {
+# Generate arm_sme.h
+clang_tablegen("arm_sme") {
   args = [ "-gen-arm-sme-header" ]
   td_file = "//clang/include/clang/Basic/arm_sme.td"
-  output_name = "arm_sme_draft_spec_subject_to_change.h"
+  output_name = "arm_sme.h"
 }
 
 # Generate arm_bf16.h
@@ -65,7 +65,7 @@ copy("tablegen_headers") {
     ":arm_fp16",
     ":arm_mve",
     ":arm_neon",
-    ":arm_sme_draft_spec_subject_to_change",
+    ":arm_sme",
     ":arm_sve",
     ":riscv_vector",
   ]
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 037719a51dd1..56a4a72fece6 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1625,14 +1625,14 @@ gentbl(
 )
 
 gentbl(
-    name = "headers_arm_sme_draft_spec_subject_to_change_gen",
+    name = "headers_arm_sme_gen",
     copts = [
         "-Wno-implicit-fallthrough",
         "-Wno-error=frame-larger-than=",
     ],
     tbl_outs = [(
         "-gen-arm-sme-header",
-        "lib/Headers/arm_sme_draft_spec_subject_to_change.h",
+        "lib/Headers/arm_sme.h",
     )],
     tblgen = ":clang-tblgen",
     td_file = "include/clang/Basic/arm_sme.td",
@@ -1673,7 +1673,7 @@ builtin_headers = glob(
         "lib/Headers/arm_mve.h",
         "lib/Headers/arm_neon.h",
         "lib/Headers/arm_sve.h",
-        "lib/Headers/arm_sme_draft_spec_subject_to_change.h",
+        "lib/Headers/arm_sme.h",
         "lib/Headers/arm_bf16.h",
         "lib/Headers/module.modulemap",
         "lib/Headers/riscv_vector.h",
-- 
Gitee

From 55e6ba4e7f06886b9fc38033e5c0f32069fe0283 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Wed, 31 Jan 2024 09:04:13 +0000
Subject: [PATCH 62/77] [SME] Stop RA from coalescing COPY instructions that
 transcend beyond smstart/smstop.
 (#78294)

This patch introduces a 'COALESCER_BARRIER', which is a pseudo node that
expands to a 'nop', but which stops the register allocator from coalescing
a COPY node when its use/def crosses an SMSTART or SMSTOP instruction.

For example:

    %0:fpr64 = COPY killed $d0
    undef %2.dsub:zpr = COPY %0  // <- Do not coalesce this COPY
    ADJCALLSTACKDOWN 0, 0
    MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0
    $d0 = COPY killed %0
    BL @use_f64, csr_aarch64_aapcs

If the COPY would be coalesced, that would lead to:

    $d0 = COPY killed %0

being replaced by:

    $d0 = COPY killed %2.dsub

which means the whole ZPR reg would be live up to the call, causing the
MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register:

    str q0, [sp]  // 16-byte Folded Spill
    smstop sm
    ldr z0, [sp]  // 16-byte Folded Reload
    bl use_f64

which would be incorrect for two reasons:

1. The program may load more data than it has allocated.
2. If there are other SVE objects on the stack, the compiler might use the
   'mul vl' addressing modes to access the spill location.

By disabling the coalescing, we get the desired results:

    str d0, [sp, #8]  // 8-byte Folded Spill
    smstop sm
    ldr d0, [sp, #8]  // 8-byte Folded Reload
    bl use_f64

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |    6 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |   24 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    4 +-
 .../Target/AArch64/AArch64RegisterInfo.cpp    |   35 +
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |   22 +
 .../AArch64/sme-disable-gisel-fisel.ll        |   35 +-
 ...ate-sm-changing-call-disable-coalescing.ll | 1640 +++++++++++++++++
 .../CodeGen/AArch64/sme-streaming-body.ll     |    4 +
 .../sme-streaming-compatible-interface.ll     |   29 +-
 .../AArch64/sme-streaming-interface.ll        |   12 +-
 ...nging-call-disable-stackslot-scavenging.ll |    2 +-
 11 files changed, 1777 insertions(+), 36 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 8e1590733615..fe392ddcf057 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1483,6 +1483,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
     return true;
   }
+  case AArch64::COALESCER_BARRIER_FPR16:
+  case AArch64::COALESCER_BARRIER_FPR32:
+  case AArch64::COALESCER_BARRIER_FPR64:
+  case AArch64::COALESCER_BARRIER_FPR128:
+    MI.eraseFromParent();
+    return true;
   }
   return false;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 875cedee1881..79ce886394ce 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2277,6 +2277,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((AArch64ISD::NodeType)Opcode) {
   case AArch64ISD::FIRST_NUMBER:
     break;
+    MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
     MAKE_CASE(AArch64ISD::SMSTART)
     MAKE_CASE(AArch64ISD::SMSTOP)
     MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -6868,13 +6869,18 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
   }
 }
 
+static bool isPassedInFPR(EVT VT) {
+  return VT.isFixedLengthVector() ||
+         (VT.isFloatingPoint() && !VT.isScalableVector());
+}
+
 /// LowerCallResult - Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
 SDValue AArch64TargetLowering::LowerCallResult(
     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
-    SDValue ThisVal) const {
+    SDValue ThisVal, bool RequiresSMChange) const {
   DenseMap<unsigned, SDValue> CopiedRegs;
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -6919,6 +6925,10 @@ SDValue AArch64TargetLowering::LowerCallResult(
       break;
     }
 
+    if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
+      Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
+                        Val);
+
     InVals.push_back(Val);
   }
 
@@ -7596,6 +7606,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
           return ArgReg.Reg == VA.getLocReg();
         });
       } else {
+        // Add an extra level of indirection for streaming mode changes by
+        // using a pseudo copy node that cannot be rematerialised between a
+        // smstart/smstop and the call by the simple register coalescer.
+        if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
+          Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
+                            Arg.getValueType(), Arg);
         RegsToPass.emplace_back(VA.getLocReg(), Arg);
         RegsUsed.insert(VA.getLocReg());
         const TargetOptions &Options = DAG.getTarget().Options;
@@ -7829,9 +7845,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Handle result values, copying them out of physregs into vregs that we
   // return.
-  SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs,
-                                   DL, DAG, InVals, IsThisReturn,
-                                   IsThisReturn ? OutVals[0] : SDValue());
+  SDValue Result = LowerCallResult(
+      Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
+      IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
 
   if (!Ins.empty())
     InGlue = Result.getValue(Result->getNumValues() - 1);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 060c939f7017..2f6dc303934d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,6 +58,8 @@ enum NodeType : unsigned {
 
   CALL_BTI, // Function call followed by a BTI instruction.
+  COALESCER_BARRIER,
+
   SMSTART,
   SMSTOP,
   RESTORE_ZA,
@@ -971,7 +973,7 @@ private:
                           const SmallVectorImpl<CCValAssign> &RVLocs,
                           const SDLoc &DL, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
-                          SDValue ThisVal) const;
+                          SDValue ThisVal, bool RequiresSMChange) const;
 
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index d1ddf6d76975..7372dbf7ec86 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -987,6 +987,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
     MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg,
     const TargetRegisterClass *DstRC, unsigned DstSubReg,
     const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+
   if (MI->isCopy() &&
       ((DstRC->getID() == AArch64::GPR64RegClassID) ||
        (DstRC->getID() == AArch64::GPR64commonRegClassID)) &&
@@ -995,5 +997,38 @@ bool AArch64RegisterInfo::shouldCoalesce(
     // which implements a 32 to 64 bit zero extension
     // which relies on the upper 32 bits being zeroed.
     return false;
+
+  auto IsCoalescerBarrier = [](const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AArch64::COALESCER_BARRIER_FPR16:
+    case AArch64::COALESCER_BARRIER_FPR32:
+    case AArch64::COALESCER_BARRIER_FPR64:
+    case AArch64::COALESCER_BARRIER_FPR128:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  // For calls that temporarily have to toggle streaming mode as part of the
+  // call-sequence, we need to be more careful when coalescing copy instructions
+  // so that we don't end up coalescing the NEON/FP result or argument register
+  // with a whole Z-register, such that after coalescing the register allocator
+  // will try to spill/reload the entire Z register.
+  //
+  // We do this by checking if the node has any defs/uses that are
+  // COALESCER_BARRIER pseudos. These are 'nops' in practice, but they exist to
+  // instruct the coalescer to avoid coalescing the copy.
+  if (MI->isCopy() && SubReg != DstSubReg &&
+      (AArch64::ZPRRegClass.hasSubClassEq(DstRC) ||
+       AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) {
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier))
+      return false;
+    unsigned DstReg = MI->getOperand(0).getReg();
+    if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier))
+      return false;
+  }
+
   return true;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index daaf040656f3..234e1bdfcc2e 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -22,6 +22,8 @@ def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA",
                                 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
                                 [SDNPHasChain, SDNPSideEffect, SDNPVariadic,
                                  SDNPOptInGlue]>;
+def AArch64CoalescerBarrier
+    : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>;
 
 //===----------------------------------------------------------------------===//
 // Instruction naming conventions.
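To make the intended effect of this node concrete, here is a rough,
hypothetical MIR sketch (not part of the patch; the vreg numbers and exact
placement are invented for exposition, shown for the f64 case only). The
pseudo defines the vreg that feeds both the ZPR subregister copy and the
physreg copy, so shouldCoalesce() finds a COALESCER_BARRIER among the
def/use instructions and backs off, while the pseudo itself is erased
during pseudo expansion and emits no instruction:

    %0:fpr64 = COPY killed $d0
    %1:fpr64 = COALESCER_BARRIER_FPR64 %0    // erased later, expands to nothing
    undef %2.dsub:zpr = COPY %1              // shouldCoalesce() now returns false here
    MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0
    $d0 = COPY killed %1
    BL @use_f64, csr_aarch64_aapcs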
@@ -183,6 +185,26 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
           (MSR 0xde85, GPR64:$val)>;
 def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>;
 
+multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> {
+  def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> {
+    let Constraints = "$dst = $src";
+  }
+  foreach vt = vts in {
+    def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$src))),
+              (!cast<Instruction>(NAME) rc:$src)>;
+  }
+}
+
+multiclass CoalescerBarriers {
+  defm _FPR16 : CoalescerBarrierPseudo;
+  defm _FPR32 : CoalescerBarrierPseudo;
+  defm _FPR64 : CoalescerBarrierPseudo;
+  defm _FPR128 : CoalescerBarrierPseudo;
+}
+
+defm COALESCER_BARRIER : CoalescerBarriers;
+
 } // End let Predicates = [HasSME]
 
 // Pseudo to match to smstart/smstop. This expands:
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index e9be9c785a19..bbd96117d85e 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -23,9 +23,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ; CHECK-FISEL-NEXT:    bl streaming_callee
 ; CHECK-FISEL-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-FISEL-NEXT:    smstop sm
+; CHECK-FISEL-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-FISEL-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-FISEL-NEXT:    ldr d0, [x8, :lo12:.LCPI0_0]
-; CHECK-FISEL-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-FISEL-NEXT:    fadd d0, d1, d0
 ; CHECK-FISEL-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-FISEL-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -49,9 +49,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ; CHECK-GISEL-NEXT:    bl streaming_callee
 ; CHECK-GISEL-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    smstop sm
-; CHECK-GISEL-NEXT:    mov x8, #4631107791820423168
-; CHECK-GISEL-NEXT:    fmov d0, x8
 ; CHECK-GISEL-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
+; CHECK-GISEL-NEXT:
mov x8, #4631107791820423168 -; CHECK-COMMON-NEXT: fmov d0, x8 ; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d0, x8 ; CHECK-COMMON-NEXT: fadd d0, d1, d0 ; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm @@ -319,9 +321,9 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm @@ -374,14 +376,15 @@ define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nou define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind { ; CHECK-COMMON-LABEL: frem_call_sm: ; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: stp s0, s1, [sp, #72] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #72] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf ; CHECK-COMMON-NEXT: str s0, [sp, #76] // 4-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm @@ -408,7 +411,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: bl __arm_sme_state +; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: and x19, x0, #0x1 +; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 ; CHECK-COMMON-NEXT: // %bb.1: ; CHECK-COMMON-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll new file mode 100644 index 000000000000..d5bea725b6d1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -0,0 +1,1640 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-unknown-eabi-elf" + +; 
This test verifies that call arguments and results are not coalesced +; with SVE vector registers by the coalescer, such that no 'mul vl' +; ldr/str pairs are generated in the streaming-mode-changing call +; sequence. + +; +; Scalar arguments +; + +define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i8 %arg, i32 0 + call void @use_i8(i8 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i16 %arg, i32 0 + call void @use_i16(i16 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i32 %arg, i32 0 + call void @use_i32(i32 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl use_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i64 %arg, i32 0 + call void @use_i64(i64 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, half %arg, i32 0 + call void @use_f16(half %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, float %arg, i32 0 + call void @use_f32(float %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, double %arg, i32 0 + call void @use_f64(double %arg) + store %vec, ptr %ptr + ret void +} + + +; +; Single-element vector arguments +; + +define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i8> %arg, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + call void @use_v16i8(<1 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i16> %arg, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + call void @use_v8i16(<1 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i32> %arg, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + call void @use_v4i32(<1 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i64> %arg, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + call void @use_v2i64(<1 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x half> %arg, i32 0 + %vec = insertelement poison, half %elt, i32 0 + call void @use_v8f16(<1 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x float> %arg, i32 0 + %vec = insertelement poison, float %elt, i32 0 + call void @use_v4f32(<1 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x double> %arg, i32 0 + %vec = insertelement poison, double %elt, i32 0 + call void @use_v2f64(<1 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Full vector arguments +; + +define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %arg, i64 0) + call void @use_v16i8(<16 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %arg, i64 0) + call void @use_v8i16(<8 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %arg, i64 0) + call void @use_v4i32(<4 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %arg, i64 0) + call void @use_v2i64(<2 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %arg, i64 0) + call void @use_v8f16(<8 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8bf16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> %arg, i64 0) + call void @use_v8bf16(<8 x bfloat> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> %arg, i64 0) + call void @use_v4f32(<4 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> %arg, i64 0) + call void @use_v2f64(<2 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; <8 x i1> type will need type promotion. +; +define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: and z1.b, z1.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v8i1 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr p0, [x8, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: str p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8i1.v8i1( poison, <8 x i1> %arg, i64 0) + call void @use_v8i1(<8 x i1> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Scalar return values +; + +define void @dont_coalesce_res_i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i8 @get_i8() + %vec = insertelement poison, i8 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i16 @get_i16() + %vec = insertelement poison, i16 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i32 @get_i32() + %vec = insertelement poison, i32 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @get_i64() + %vec = insertelement poison, i64 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f16 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call half @get_f16() + %vec = insertelement poison, half %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f32 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call float @get_f32() + %vec = insertelement poison, float %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f64(ptr %ptr) #0 { +; CHECK-LABEL: 
dont_coalesce_res_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call double @get_f64() + %vec = insertelement poison, double %res, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Single-element vector result values +; + +define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i8 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i8> @get_v1i8() + %elt = extractelement <1 x i8> %res, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i16 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, 
#16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i16> @get_v1i16() + %elt = extractelement <1 x i16> %res, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i32 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i32> @get_v1i32() + %elt = extractelement <1 x i32> %res, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i64> @get_v1i64() + %elt = extractelement <1 x i64> %res, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f16 +; 
CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x half> @get_v1f16() + %elt = extractelement <1 x half> %res, i32 0 + %vec = insertelement poison, half %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f32 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x float> @get_v1f32() + %elt = extractelement <1 x float> %res, i32 0 + %vec = insertelement poison, float %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x double> @get_v1f64() + %elt = extractelement <1 x double> %res, i32 0 + %vec = insertelement poison, double %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Full vector result values +; + +define void @dont_coalesce_res_v16i8(ptr %ptr) #0 
{ +; CHECK-LABEL: dont_coalesce_res_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v16i8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <16 x i8> @get_v16i8() + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8i16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x i16> @get_v8i16() + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4i32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte 
Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x i32> @get_v4i32() + %vec = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2i64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x i64> @get_v2i64() + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8f16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x half> @get_v8f16() + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4f32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: 
smstart sm
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <4 x float> @get_v4f32()
+  %vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0)
+  store <vscale x 4 x float> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_v2f64(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v2f64
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <2 x double> @get_v2f64()
+  %vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0)
+  store <vscale x 2 x double> %vec, ptr %ptr
+  ret void
+}
+
+declare half @get_f16()
+declare float @get_f32()
+declare double @get_f64()
+declare <1 x half> @get_v1f16()
+declare <1 x float> @get_v1f32()
+declare <1 x double> @get_v1f64()
+declare <8 x half> @get_v8f16()
+declare <4 x float> @get_v4f32()
+declare <2 x double> @get_v2f64()
+
+declare i8 @get_i8()
+declare i16 @get_i16()
+declare i32 @get_i32()
+declare i64 @get_i64()
+declare <1 x i8> @get_v1i8()
+declare <1 x i16> @get_v1i16()
+declare <1 x i32> @get_v1i32()
+declare <1 x i64> @get_v1i64()
+declare <16 x i8> @get_v16i8()
+declare <8 x i16> @get_v8i16()
+declare <4 x i32> @get_v4i32()
+declare <2 x i64> @get_v2i64()
+
+declare void @use_f16(half)
+declare void @use_f32(float)
+declare void @use_f64(double)
+declare void @use_v1f16(<1 x half>)
+declare void @use_v1f32(<1 x float>)
+declare void @use_v1f64(<1 x double>)
+declare void @use_v8f16(<8 x half>)
+declare void @use_v8bf16(<8 x bfloat>)
+declare void @use_v4f32(<4 x float>)
+declare void @use_v2f64(<2 x double>)
+
+declare void @use_i8(i8)
+declare void @use_i16(i16)
+declare void @use_i32(i32)
+declare void @use_i64(i64)
+declare void @use_v1i8(<1 x i8>)
+declare void @use_v1i16(<1 x i16>)
+declare void @use_v1i32(<1 x i32>)
+declare void @use_v1i64(<1 x i64>)
+declare void @use_v16i8(<16 x i8>)
+declare void @use_v8i16(<8 x i16>)
+declare void @use_v4i32(<4 x i32>)
+declare void @use_v2i64(<2 x i64>)
+declare void @use_v8i1(<8 x i1>)
+
+declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v8i1(<vscale x 8 x i1>, <8 x i1>, i64)
+declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>,
<16 x i8>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
+
+attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sve,+sme" }
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index 5e4cdc46843d..91accad7f2fd 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -245,11 +245,15 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
 ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT: smstop sm
 ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT: bl cos
 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT: smstop sm
 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 7d7f6af8a641..6165ccb14aae 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -129,30 +129,37 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: add x8, sp, #16
 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: add x8, sp, #16
 ; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: tbz w19, #0, .LBB4_2
 ; CHECK-NEXT: // %bb.1:
 ; CHECK-NEXT: smstop sm
 ; CHECK-NEXT: .LBB4_2:
-; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: bl normal_callee_vec_arg
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: tbz w19, #0, .LBB4_4
 ; CHECK-NEXT: // %bb.3:
 ; CHECK-NEXT: smstart sm
 ; CHECK-NEXT: .LBB4_4:
-; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT:
ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -462,7 +469,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: ldp s4, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: stp s4, s0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: ldp d4, d0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: stp d4, d0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB10_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 9de45c649caf..556c461f573f 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -313,9 +313,9 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-NEXT: bl cos ; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm @@ -405,11 +405,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: stp s1, s0, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp d3, d2, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp s1, s0, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp d3, d2, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: bl bar ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index e4cd4d6c05c5..f56484e82157 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -22,9 +22,9 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str s0, [sp, 
#12] // 4-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f -- Gitee From f89586e83b785d91236681ac0cc99a8a02bcdd53 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 31 Jan 2024 11:38:29 +0000 Subject: [PATCH 63/77] [AArch64][SME] Fix inlining bug introduced in #78703 (#79994) Calling a `__arm_locally_streaming` function from a function that is not a streaming-SVE function would lead to incorrect inlining. The issue didn't surface because the tests were not testing what they were supposed to test. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../AArch64/AArch64TargetTransformInfo.cpp | 15 +- .../Inline/AArch64/sme-pstatesm-attrs.ll | 369 +++++++++--------- 2 files changed, 194 insertions(+), 190 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 2aaf2194ccaf..2db3bb7b8cd8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -221,15 +221,20 @@ static bool hasPossibleIncompatibleOps(const Function *F) { bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { - SMEAttrs CallerAttrs(*Caller); - SMEAttrs CalleeAttrs(*Callee); + SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); + + // When inlining, we should consider the body of the function, not the + // interface. + if (CalleeAttrs.hasStreamingBody()) { + CalleeAttrs.set(SMEAttrs::SM_Compatible, false); + CalleeAttrs.set(SMEAttrs::SM_Enabled, true); + } + if (CalleeAttrs.hasNewZABody()) return false; if (CallerAttrs.requiresLazySave(CalleeAttrs) || - (CallerAttrs.requiresSMChange(CalleeAttrs) && - (!CallerAttrs.hasStreamingInterfaceOrBody() || - !CalleeAttrs.hasStreamingBody()))) { + CallerAttrs.requiresSMChange(CalleeAttrs)) { if (hasPossibleIncompatibleOps(Callee)) return false; } diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll index f2f5768dbe9c..3aca46fea04c 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll @@ -1,71 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline | FileCheck %s -declare void @inlined_body() "aarch64_pstate_sm_compatible"; +declare i32 @llvm.vscale.i32() -; Define some functions that will be called by the functions below. -; These just call a '...body()' function. If we see the call to one of -; these functions being replaced by '...body()', then we know it has been -; inlined. +; Define some functions that merely call llvm.vscale.i32(), which will be called +; by the other functions below. If we see the call to one of these functions +; being replaced by 'llvm.vscale()', then we know it has been inlined. 
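; Illustrative sketch (not part of the test file) of the miscompile the fix
; guards against: a locally streaming callee runs its body with PSTATE.SM=1,
; so its llvm.vscale.i32() may differ from the caller's; inlining it into a
; non-streaming caller must therefore be rejected. Function names here are
; hypothetical:
;
;   define i32 @ls_callee() "aarch64_pstate_sm_body" {
;     %res = call i32 @llvm.vscale.i32()
;     ret i32 %res
;   }
;   define i32 @n_caller() {
;     %res = call i32 @ls_callee()  ; must remain a call, not become @llvm.vscale.i32()
;     ret i32 %res
;   }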
-define void @normal_callee() {
-; CHECK-LABEL: define void @normal_callee
+define i32 @normal_callee() {
+; CHECK-LABEL: define i32 @normal_callee
 ; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @inlined_body()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }

-define void @streaming_callee() "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: define void @streaming_callee
+define i32 @streaming_callee() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i32 @streaming_callee
 ; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @inlined_body()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }

-define void @locally_streaming_callee() "aarch64_pstate_sm_body" {
-; CHECK-LABEL: define void @locally_streaming_callee
+define i32 @locally_streaming_callee() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @locally_streaming_callee
 ; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @inlined_body()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }

-define void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: define void @streaming_compatible_callee
+define i32 @streaming_compatible_callee() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: define i32 @streaming_compatible_callee
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @inlined_body()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }

-define void @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
-; CHECK-LABEL: define void @streaming_compatible_locally_streaming_callee
+define i32 @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_callee
 ; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @inlined_body()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }

 ; Now test that inlining only happens when their streaming modes match.
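; Key to the labels in the checklists below, matching the callees above:
; N = no SME attribute, S = "aarch64_pstate_sm_enabled",
; SC = "aarch64_pstate_sm_compatible", and "+ B" = "aarch64_pstate_sm_body"
; (a locally streaming body). As an illustrative sketch with hypothetical
; names, "N -> SC + B" stands for a pair like:
;
;   define void @n_caller() {
;     call void @sc_b_callee()
;     ret void
;   }
;   define void @sc_b_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
;     ret void
;   }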
@@ -85,16 +84,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_normal_callee_inline() { -; CHECK-LABEL: define void @normal_caller_normal_callee_inline +define i32 @normal_caller_normal_callee_inline() { +; CHECK-LABEL: define i32 @normal_caller_normal_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] N -> N @@ -102,16 +101,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_callee_inline +define i32 @normal_caller_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 @normal_caller_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] N -> N @@ -119,16 +118,16 @@ entry: ; [x] N -> SC ; [ ] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_streaming_compatible_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_compatible_callee_inline +define i32 @normal_caller_streaming_compatible_callee_inline() { +; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] N -> N @@ -136,16 +135,16 @@ entry: ; [ ] N -> SC ; [x] N -> N + B ; [ ] N -> SC + B -define void @normal_caller_locally_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_locally_streaming_callee_inline +define i32 @normal_caller_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 @normal_caller_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] N -> N @@ -153,16 +152,16 @@ entry: ; [ ] N -> SC ; [ ] N -> N + B ; [x] N -> SC + B -define void @normal_caller_streaming_compatible_locally_streaming_callee_inline() { -; CHECK-LABEL: define void @normal_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret 
i32 %res } ; [x] S -> N @@ -170,16 +169,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_normal_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_normal_callee_inline +define i32 @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] S -> N @@ -187,16 +186,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_callee_inline +define i32 @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] S -> N @@ -204,16 +203,16 @@ entry: ; [x] S -> SC ; [ ] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_compatible_callee_inline +define i32 @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] S -> N @@ -221,16 +220,16 @@ entry: ; [ ] S -> SC ; [x] S -> N + B ; [ ] S -> SC + B -define void @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_locally_streaming_callee_inline +define i32 @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] S -> N @@ -238,16 +237,16 @@ entry: ; [ ] S -> SC ; [ ] S -> N + B ; [x] S -> SC + B -define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { -; CHECK-LABEL: define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: 
-; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] N + B -> N @@ -255,16 +254,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_normal_callee_inline +define i32 @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] N + B -> N @@ -272,16 +271,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_callee_inline +define i32 @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] N + B -> N @@ -289,16 +288,16 @@ entry: ; [x] N + B -> SC ; [ ] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_callee_inline +define i32 @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] N + B -> N @@ -306,16 +305,16 @@ entry: ; [ ] N + B -> SC ; [x] N + B -> N + B ; [ ] N + B -> SC + B -define void @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_locally_streaming_callee_inline +define i32 @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] N + B -> N @@ -323,16 +322,16 @@ entry: ; [ ] N + B -> SC ; [ ] N + B -> N + B ; [x] N 
+ B -> SC + B -define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] SC -> N @@ -340,16 +339,16 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_normal_callee_inline +define i32 @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] SC -> N @@ -357,16 +356,16 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_callee_inline +define i32 @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] SC -> N @@ -374,16 +373,16 @@ entry: ; [x] SC -> SC ; [ ] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_callee_inline +define i32 @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] SC -> N @@ -391,16 +390,16 @@ entry: ; [ ] SC -> SC ; [x] SC -> N + B ; [ ] SC -> SC + B -define void @streaming_compatible_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_locally_streaming_callee_inline +define i32 
@streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] SC -> N @@ -408,32 +407,32 @@ entry: ; [ ] SC -> SC ; [ ] SC -> N + B ; [x] SC -> SC + B -define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_inline +define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } ; [x] SC + B -> N ; [ ] SC + B -> S ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_normal_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_normal_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee() +; CHECK-NEXT: ret i32 [[RES]] ; entry: - call void @normal_callee() - ret void + %res = call i32 @normal_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -441,16 +440,16 @@ entry: ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_callee() - ret void + %res = call i32 @streaming_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -458,16 +457,16 @@ entry: ; [x] SC + B -> SC ; [ ] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" 
"aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_callee() - ret void + %res = call i32 @streaming_compatible_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -475,16 +474,16 @@ entry: ; [ ] SC + B -> SC ; [x] SC + B -> N + B ; [ ] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @locally_streaming_callee() - ret void + %res = call i32 @locally_streaming_callee() + ret i32 %res } ; [ ] SC + B -> N @@ -492,16 +491,16 @@ entry: ; [ ] SC + B -> SC ; [ ] SC + B -> N + B ; [x] SC + B -> SC + B -define void @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { -; CHECK-LABEL: define void @streaming_compatible_locally_streaming_caller_and_callee_inline +define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @inlined_body() -; CHECK-NEXT: ret void +; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i32 [[RES_I]] ; entry: - call void @streaming_compatible_locally_streaming_callee() - ret void + %res = call i32 @streaming_compatible_locally_streaming_callee() + ret i32 %res } define void @normal_callee_with_inlineasm() { -- Gitee From 86646372e92aaa297612e767b268b0058091c92a Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 1 Feb 2024 13:37:37 +0000 Subject: [PATCH 64/77] [AArch64] Replace LLVM IR function attributes for PSTATE.ZA. (#79166) Since https://github.com/ARM-software/acle/pull/276 the ACLE defines attributes to better describe the use of a given SME state. Previously the attributes merely described the possibility of it being 'shared' or 'preserved', whereas the new attributes have more semantics and also describe how the data flows through the program. 
For ZT0 we already had to add new LLVM IR attributes: * aarch64_new_zt0 * aarch64_in_zt0 * aarch64_out_zt0 * aarch64_inout_zt0 * aarch64_preserves_zt0 We have now done the same for ZA, such that we add: * aarch64_new_za (previously `aarch64_pstate_za_new`) * aarch64_in_za (more specific variation of `aarch64_pstate_za_shared`) * aarch64_out_za (more specific variation of `aarch64_pstate_za_shared`) * aarch64_inout_za (more specific variation of `aarch64_pstate_za_shared`) * aarch64_preserves_za (previously `aarch64_pstate_za_shared, aarch64_pstate_za_preserved`) This explicitly removes 'pstate' from the name, because with SME2 and the new ACLE attributes there is a difference between "sharing ZA" (sharing the ZA matrix register with the caller) and "sharing PSTATE.ZA" (sharing either the ZA or ZT0 register, both part of PSTATE.ZA with the caller). Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/lib/CodeGen/CGBuiltin.cpp | 6 +- clang/lib/CodeGen/CGCall.cpp | 16 +-- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- .../aarch64-sme-attrs.cpp | 18 +-- .../aarch64-sme-intrinsics/acle_sme_zero.c | 16 ++- clang/test/Modules/aarch64-sme-keywords.cppm | 10 +- llvm/docs/AArch64SME.rst | 37 ++++--- llvm/lib/IR/Verifier.cpp | 18 ++- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/lib/Target/AArch64/SMEABIPass.cpp | 12 +- .../AArch64/Utils/AArch64SMEAttributes.cpp | 34 +++--- .../AArch64/Utils/AArch64SMEAttributes.h | 45 ++++++-- .../AArch64/sme-disable-gisel-fisel.ll | 10 +- .../AArch64/sme-lazy-save-call-remarks.ll | 6 +- .../CodeGen/AArch64/sme-lazy-save-call.ll | 8 +- .../CodeGen/AArch64/sme-new-za-function.ll | 8 +- .../AArch64/sme-shared-za-interface.ll | 4 +- .../Inline/AArch64/sme-pstateza-attrs.ll | 22 ++-- llvm/test/Verifier/sme-attributes.ll | 62 ++++++++++- .../Target/AArch64/SMEAttributesTest.cpp | 103 +++++++++++++----- 20 files changed, 287 insertions(+), 152 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e32feb378770..eff67c6ff7cc 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10068,10 +10068,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {}, false), "__arm_sme_state")); - auto Attrs = - AttributeList() - .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible") - .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved"); + auto Attrs = AttributeList().addFnAttribute(getLLVMContext(), + "aarch64_pstate_sm_compatible"); CI->setAttributes(Attrs); CI->setCallingConv( llvm::CallingConv:: diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index b6c7e7e576b9..bb36dcb42aa7 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1770,14 +1770,14 @@ static void AddAttributesFromFunctionProtoType(ASTContext &Ctx, FuncAttrs.addAttribute("aarch64_pstate_sm_compatible"); // ZA - if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out || - FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut) - FuncAttrs.addAttribute("aarch64_pstate_za_shared"); - if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves || - FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In) { - FuncAttrs.addAttribute("aarch64_pstate_za_shared"); - FuncAttrs.addAttribute("aarch64_pstate_za_preserved"); - } + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves) + 
FuncAttrs.addAttribute("aarch64_preserves_za"); + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In) + FuncAttrs.addAttribute("aarch64_in_za"); + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out) + FuncAttrs.addAttribute("aarch64_out_za"); + if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut) + FuncAttrs.addAttribute("aarch64_inout_za"); } static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 09b3a6b484e8..e353a649f951 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2295,7 +2295,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, if (auto *Attr = D->getAttr()) { if (Attr->isNewZA()) - B.addAttribute("aarch64_pstate_za_new"); + B.addAttribute("aarch64_new_za"); } // Track whether we need to add the optnone LLVM attribute, diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp index f69703a8a7d8..fdd2de11365d 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -284,20 +284,20 @@ int test_variadic_template() __arm_inout("za") { // CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } // CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } -// CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_pstate_za_shared" } -// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" } +// CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_inout_za" } +// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_preserves_za" } // CHECK: attributes #[[NOUNWIND_CALL]] = { nounwind } // CHECK: attributes #[[NOUNWIND_SM_ENABLED_CALL]] = { nounwind "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[NOUNWIND_SM_COMPATIBLE_CALL]] = { nounwind "aarch64_pstate_sm_compatible" } -// CHECK: attributes #[[NOUNWIND_ZA_SHARED_CALL]] = { nounwind "aarch64_pstate_za_shared" } -// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" } +// CHECK: attributes #[[NOUNWIND_ZA_SHARED_CALL]] = { nounwind "aarch64_inout_za" } +// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_preserves_za" } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index ba1cf9ae991e..2c908b5611dd 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -35,11 +35,17 @@ void test_svzero_mask_za_2(void) __arm_inout("za") { svzero_mask_za(255); } -// CHECK-C-LABEL: @test_svzero_za( -// CHECK-CXX-LABEL: @_Z14test_svzero_zav( -// CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) -// CHECK-NEXT: ret void +// CHECK-C-LABEL: define dso_local void @test_svzero_za( +// CHECK-C-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-C-NEXT: entry: +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CHECK-C-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z14test_svzero_zav( +// CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CHECK-CXX-NEXT: ret void // void test_svzero_za(void) __arm_out("za") { svzero_za(); diff --git a/clang/test/Modules/aarch64-sme-keywords.cppm b/clang/test/Modules/aarch64-sme-keywords.cppm index df4dd32b16cf..759701a633ce 100644 --- a/clang/test/Modules/aarch64-sme-keywords.cppm +++ b/clang/test/Modules/aarch64-sme-keywords.cppm @@ -43,14 +43,14 @@ import A; // // CHECK:declare void @_ZW1A22f_streaming_compatiblev() #[[STREAMING_COMPATIBLE_DECL:[0-9]+]] // -// CHECK-DAG: attributes #[[SHARED_ZA_DEF]] = {{{.*}} "aarch64_pstate_za_shared" {{.*}}} -// CHECK-DAG: attributes #[[SHARED_ZA_DECL]] = {{{.*}} "aarch64_pstate_za_shared" {{.*}}} -// CHECK-DAG: attributes #[[PRESERVES_ZA_DECL]] = {{{.*}} "aarch64_pstate_za_preserved" {{.*}}} +// CHECK-DAG: attributes #[[SHARED_ZA_DEF]] = {{{.*}} "aarch64_inout_za" {{.*}}} +// CHECK-DAG: attributes #[[SHARED_ZA_DECL]] = {{{.*}} "aarch64_inout_za" {{.*}}} +// CHECK-DAG: attributes 
#[[PRESERVES_ZA_DECL]] = {{{.*}} "aarch64_preserves_za" {{.*}}} // CHECK-DAG: attributes #[[NORMAL_DEF]] = {{{.*}}} // CHECK-DAG: attributes #[[STREAMING_DECL]] = {{{.*}} "aarch64_pstate_sm_enabled" {{.*}}} // CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_DECL]] = {{{.*}} "aarch64_pstate_sm_compatible" {{.*}}} -// CHECK-DAG: attributes #[[SHARED_ZA_USE]] = { "aarch64_pstate_za_shared" } -// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" } +// CHECK-DAG: attributes #[[SHARED_ZA_USE]] = { "aarch64_inout_za" } +// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_preserves_za" } // CHECK-DAG: attributes #[[STREAMING_USE]] = { "aarch64_pstate_sm_enabled" } // CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_USE]] = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/docs/AArch64SME.rst b/llvm/docs/AArch64SME.rst index 63573bf91eac..b5a01cb204b8 100644 --- a/llvm/docs/AArch64SME.rst +++ b/llvm/docs/AArch64SME.rst @@ -22,26 +22,32 @@ Below we describe the LLVM IR attributes and their relation to the C/C++ level ACLE attributes: ``aarch64_pstate_sm_enabled`` - is used for functions with ``__attribute__((arm_streaming))`` + is used for functions with ``__arm_streaming`` ``aarch64_pstate_sm_compatible`` - is used for functions with ``__attribute__((arm_streaming_compatible))`` + is used for functions with ``__arm_streaming_compatible`` ``aarch64_pstate_sm_body`` - is used for functions with ``__attribute__((arm_locally_streaming))`` and is + is used for functions with ``__arm_locally_streaming`` and is only valid on function definitions (not declarations) -``aarch64_pstate_za_new`` - is used for functions with ``__attribute__((arm_new_za))`` +``aarch64_new_za`` + is used for functions with ``__arm_new("za")`` -``aarch64_pstate_za_shared`` - is used for functions with ``__attribute__((arm_shared_za))`` +``aarch64_in_za`` + is used for functions with ``__arm_in("za")`` -``aarch64_pstate_za_preserved`` - is used for functions with ``__attribute__((arm_preserves_za))`` +``aarch64_out_za`` + is used for functions with ``__arm_out("za")`` + +``aarch64_inout_za`` + is used for functions with ``__arm_inout("za")`` + +``aarch64_preserves_za`` + is used for functions with ``__arm_preserves("za")`` ``aarch64_expanded_pstate_za`` - is used for functions with ``__attribute__((arm_new_za))`` + is used for functions with ``__arm_new_za`` Clang must ensure that the above attributes are added both to the function's declaration/definition as well as to their call-sites. This is @@ -89,11 +95,10 @@ Restrictions on attributes * It is not allowed for a function to be decorated with both ``aarch64_pstate_sm_compatible`` and ``aarch64_pstate_sm_enabled``. -* It is not allowed for a function to be decorated with both - ``aarch64_pstate_za_new`` and ``aarch64_pstate_za_preserved``. - -* It is not allowed for a function to be decorated with both - ``aarch64_pstate_za_new`` and ``aarch64_pstate_za_shared``. +* It is not allowed for a function to be decorated with more than one of the + following attributes: + ``aarch64_new_za``, ``aarch64_in_za``, ``aarch64_out_za``, ``aarch64_inout_za``, + ``aarch64_preserves_za``. These restrictions also apply in the higher level SME ACLE, which means we can emit diagnostics in Clang to signal users about incorrect behaviour. @@ -426,7 +431,7 @@ to toggle PSTATE.ZA using intrinsics. This also makes it simpler to setup a lazy-save mechanism for calls to private-ZA functions (i.e. 
functions that may either directly or indirectly clobber ZA state). -For the purpose of handling functions marked with ``aarch64_pstate_za_new``, +For the purpose of handling functions marked with ``aarch64_new_za``, we have introduced a new LLVM IR pass (SMEABIPass) that is run just before SelectionDAG. Any such functions dealt with by this pass are marked with ``aarch64_expanded_pstate_za``. diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 1408ce293ca6..438b4f49c75a 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2116,17 +2116,13 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, V); } - if (Attrs.hasFnAttr("aarch64_pstate_za_new")) { - Check(!Attrs.hasFnAttr("aarch64_pstate_za_preserved"), - "Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_preserved' " - "are incompatible!", - V); - - Check(!Attrs.hasFnAttr("aarch64_pstate_za_shared"), - "Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_shared' " - "are incompatible!", - V); - } + Check((Attrs.hasFnAttr("aarch64_new_za") + Attrs.hasFnAttr("aarch64_in_za") + + Attrs.hasFnAttr("aarch64_inout_za") + + Attrs.hasFnAttr("aarch64_out_za") + + Attrs.hasFnAttr("aarch64_preserves_za")) <= 1, + "Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', " + "'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive", + V); if (Attrs.hasFnAttr(Attribute::JumpTable)) { const GlobalValue *GV = cast(V); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 2db3bb7b8cd8..029b0931eb95 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -230,7 +230,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, CalleeAttrs.set(SMEAttrs::SM_Enabled, true); } - if (CalleeAttrs.hasNewZABody()) + if (CalleeAttrs.isNewZA()) return false; if (CallerAttrs.requiresLazySave(CalleeAttrs) || diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 15e047e5e7de..2b713fe28d83 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -61,10 +61,8 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); - auto Attrs = - AttributeList() - .addFnAttribute(M->getContext(), "aarch64_pstate_sm_compatible") - .addFnAttribute(M->getContext(), "aarch64_pstate_za_preserved"); + auto Attrs = AttributeList().addFnAttribute(M->getContext(), + "aarch64_pstate_sm_compatible"); FunctionCallee Callee = M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs); CallInst *Call = Builder.CreateCall(Callee); @@ -79,7 +77,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { } /// This function generates code at the beginning and end of a function marked -/// with either `aarch64_pstate_za_new` or `aarch64_new_zt0`. +/// with either `aarch64_new_za` or `aarch64_new_zt0`. 
/// At the beginning of the function, the following code is generated: /// - Commit lazy-save if active [Private-ZA Interface*] /// - Enable PSTATE.ZA [Private-ZA Interface] @@ -134,7 +132,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); } - if (FnAttrs.hasNewZABody()) { + if (FnAttrs.isNewZA()) { Function *ZeroIntr = Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, @@ -175,7 +173,7 @@ bool SMEABI::runOnFunction(Function &F) { bool Changed = false; SMEAttrs FnAttrs(F); - if (FnAttrs.hasNewZABody() || FnAttrs.isNewZT0()) + if (FnAttrs.isNewZA() || FnAttrs.isNewZT0()) Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs); return Changed; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 7cb7b296a163..cd72944bcdb2 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -20,12 +20,16 @@ void SMEAttrs::set(unsigned M, bool Enable) { assert(!(hasStreamingInterface() && hasStreamingCompatibleInterface()) && "SM_Enabled and SM_Compatible are mutually exclusive"); - assert(!(hasNewZABody() && hasSharedZAInterface()) && - "ZA_New and ZA_Shared are mutually exclusive"); - assert(!(hasNewZABody() && preservesZA()) && - "ZA_New and ZA_Preserved are mutually exclusive"); - assert(!(hasNewZABody() && (Bitmask & SME_ABI_Routine)) && + + // ZA Attrs + assert(!(isNewZA() && (Bitmask & SME_ABI_Routine)) && "ZA_New and SME_ABI_Routine are mutually exclusive"); + + assert( + (!sharesZA() || + (isNewZA() ^ isInZA() ^ isInOutZA() ^ isOutZA() ^ isPreservesZA())) && + "Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', " + "'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive"); } SMEAttrs::SMEAttrs(const CallBase &CB) { @@ -39,8 +43,8 @@ SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) { if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); if (FuncName == "__arm_tpidr2_restore") - Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared | - SMEAttrs::SME_ABI_Routine); + Bitmask |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | + SMEAttrs::SME_ABI_Routine; } SMEAttrs::SMEAttrs(const AttributeList &Attrs) { @@ -51,12 +55,16 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= SM_Compatible; if (Attrs.hasFnAttr("aarch64_pstate_sm_body")) Bitmask |= SM_Body; - if (Attrs.hasFnAttr("aarch64_pstate_za_shared")) - Bitmask |= ZA_Shared; - if (Attrs.hasFnAttr("aarch64_pstate_za_new")) - Bitmask |= ZA_New; - if (Attrs.hasFnAttr("aarch64_pstate_za_preserved")) - Bitmask |= ZA_Preserved; + if (Attrs.hasFnAttr("aarch64_in_za")) + Bitmask |= encodeZAState(StateValue::In); + if (Attrs.hasFnAttr("aarch64_out_za")) + Bitmask |= encodeZAState(StateValue::Out); + if (Attrs.hasFnAttr("aarch64_inout_za")) + Bitmask |= encodeZAState(StateValue::InOut); + if (Attrs.hasFnAttr("aarch64_preserves_za")) + Bitmask |= encodeZAState(StateValue::Preserved); + if (Attrs.hasFnAttr("aarch64_new_za")) + Bitmask |= encodeZAState(StateValue::New); } bool SMEAttrs::requiresSMChange(const SMEAttrs &Callee) const { diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index b43014a5248d..8214fda99fb1 100644 --- 
a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -26,16 +26,24 @@ class SMEAttrs { unsigned Bitmask; public: + enum class StateValue { + None = 0, + In = 1, //aarch64_in_zt0 + Out = 2, //aarch64_out_zt0 + InOut = 3, //aarch64_inout_zt0 + Preserved = 4, //aarch64_preserves_zt0 + New = 5 //aarch64_new_zt0 + }; + // Enum with bitmasks for each individual SME feature. enum Mask { Normal = 0, SM_Enabled = 1 << 0, // aarch64_pstate_sm_enabled SM_Compatible = 1 << 1, // aarch64_pstate_sm_compatible SM_Body = 1 << 2, // aarch64_pstate_sm_body - ZA_Shared = 1 << 3, // aarch64_pstate_sm_shared - ZA_New = 1 << 4, // aarch64_pstate_sm_new - ZA_Preserved = 1 << 5, // aarch64_pstate_sm_preserved - SME_ABI_Routine = 1 << 6, // Used for SME ABI routines to avoid lazy saves + SME_ABI_Routine = 1 << 3, // Used for SME ABI routines to avoid lazy saves + ZA_Shift = 4, + ZA_Mask = 0b111 << ZA_Shift, }; SMEAttrs(unsigned Mask = Normal) : Bitmask(0) { set(Mask); } @@ -66,14 +74,29 @@ public: /// streaming mode. bool requiresSMChange(const SMEAttrs &Callee) const; - // Interfaces to query PSTATE.ZA - bool hasNewZABody() const { return Bitmask & ZA_New; } - bool hasSharedZAInterface() const { return Bitmask & ZA_Shared; } - bool hasPrivateZAInterface() const { return !hasSharedZAInterface(); } - bool preservesZA() const { return Bitmask & ZA_Preserved; } - bool hasZAState() const { - return hasNewZABody() || hasSharedZAInterface(); + // Interfaces to query ZA + static StateValue decodeZAState(unsigned Bitmask) { + return static_cast((Bitmask & ZA_Mask) >> ZA_Shift); + } + static unsigned encodeZAState(StateValue S) { + return static_cast(S) << ZA_Shift; } + + bool isNewZA() const { return decodeZAState(Bitmask) == StateValue::New; } + bool isInZA() const { return decodeZAState(Bitmask) == StateValue::In; } + bool isOutZA() const { return decodeZAState(Bitmask) == StateValue::Out; } + bool isInOutZA() const { return decodeZAState(Bitmask) == StateValue::InOut; } + bool isPreservesZA() const { + return decodeZAState(Bitmask) == StateValue::Preserved; + } + bool sharesZA() const { + StateValue State = decodeZAState(Bitmask); + return State == StateValue::In || State == StateValue::Out || + State == StateValue::InOut || State == StateValue::Preserved; + } + bool hasSharedZAInterface() const { return sharesZA() || sharesZT0(); } + bool hasPrivateZAInterface() const { return !hasSharedZAInterface(); } + bool hasZAState() const { return isNewZA() || sharesZA(); } bool requiresLazySave(const SMEAttrs &Callee) const { return hasZAState() && Callee.hasPrivateZAInterface() && !(Callee.Bitmask & SME_ABI_Routine); diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index bbd96117d85e..86ee85ad50f9 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -199,9 +199,9 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno ; Check ZA state ; -declare double @za_shared_callee(double) "aarch64_pstate_za_shared" +declare double @za_shared_callee(double) "aarch64_inout_za" -define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_pstate_za_new"{ +define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{ ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee: ; CHECK-COMMON: // %bb.0: // %prelude ; CHECK-COMMON-NEXT: 
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -238,7 +238,7 @@ entry: ret double %add; } -define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_pstate_za_shared"{ +define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{ ; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee: ; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -278,7 +278,7 @@ entry: } ; Ensure we set up and restore the lazy save correctly for instructions which are lowered to lib calls. -define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwind { +define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-LABEL: f128_call_za: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -340,7 +340,7 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw } ; As above this should use Selection DAG to make sure the libcall call is lowered correctly. -define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nounwind { +define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-LABEL: frem_call_za: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll index d999311301f9..65e50842d5d7 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll @@ -4,13 +4,13 @@ declare void @private_za_callee() declare float @llvm.cos.f32(float) -define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { +define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() ret void } -define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { +define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() ; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA @@ -18,7 +18,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { ret void } -define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" { +define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" { ; CHECK: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA %res = call float @llvm.cos.f32(float %a) ret float %res diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 7eb7eb21ca59..2cd03fac0999 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -5,7 +5,7 @@ declare void @private_za_callee() declare float @llvm.cos.f32(float) ; Test lazy-save mechanism for a single callee. 
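; Illustrative sketch of the shape that requires a lazy save, mirroring the
; tests below: the caller has ZA state while the callee has a private-ZA
; interface (it carries no ZA attribute at all), so the compiler must set up
; a TPIDR2-based lazy save around the call. Names here are hypothetical:
;
;   declare void @no_za()                        ; private-ZA interface
;   define void @za_caller() "aarch64_inout_za" {
;     call void @no_za()                         ; lazy save set up around this call
;     ret void
;   }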
-define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { +define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_1_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -38,7 +38,7 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { } ; Test lazy-save mechanism for multiple callees. -define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { +define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_2_callees: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill @@ -85,7 +85,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { } ; Test a call of an intrinsic that gets expanded to a library call. -define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" { +define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_expanded_intrinsic: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -118,7 +118,7 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_psta } ; Test a combination of streaming-compatible -> normal call with lazy-save. -define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" { +define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: test_lazy_save_and_conditional_smstart: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll index 0cee26dbb349..04d26902c536 100644 --- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll +++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s | FileCheck %s ; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s -declare void @shared_za_callee() "aarch64_pstate_za_shared" +declare void @shared_za_callee() "aarch64_inout_za" -define void @private_za() "aarch64_pstate_za_new" { +define void @private_za() "aarch64_new_za" { ; CHECK-LABEL: @private_za( ; CHECK-NEXT: prelude: ; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() @@ -24,7 +24,7 @@ define void @private_za() "aarch64_pstate_za_new" { ret void } -define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_pstate_za_new" { +define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" { ; CHECK-LABEL: @private_za_multiple_exit( ; CHECK-NEXT: prelude: ; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() @@ -62,4 +62,4 @@ if.end: } ; CHECK: declare void @__arm_tpidr2_save() #[[ATTR:[0-9]+]] -; CHECK: attributes #[[ATTR]] = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_preserved" } +; CHECK: attributes #[[ATTR]] = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index c9714b9ad848..c885fa4a76ec 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -4,7 +4,7 @@ declare void @private_za_callee() ; Ensure that we don't use tail call optimization when a 
lazy-save is required. -define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind { +define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -37,7 +37,7 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind { } ; Ensure we set up and restore the lazy save correctly for instructions which are lowered to lib calls -define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwind { +define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-LABEL: f128_call_za: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll index 7fca45b1e43f..816492768cc0 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll @@ -22,7 +22,7 @@ entry: ret void } -define void @shared_za_callee() "aarch64_pstate_za_shared" { +define void @shared_za_callee() "aarch64_inout_za" { ; CHECK-LABEL: define void @shared_za_callee ; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: @@ -34,7 +34,7 @@ entry: ret void } -define void @new_za_callee() "aarch64_pstate_za_new" { +define void @new_za_callee() "aarch64_new_za" { ; CHECK-LABEL: define void @new_za_callee ; CHECK-SAME: () #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: call void @inlined_body() @@ -84,7 +84,7 @@ entry: ; [x] Z -> N ; [ ] Z -> S ; [ ] Z -> Z -define void @new_za_caller_nonza_callee_inline() "aarch64_pstate_za_new" { +define void @new_za_caller_nonza_callee_inline() "aarch64_new_za" { ; CHECK-LABEL: define void @new_za_caller_nonza_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: @@ -99,7 +99,7 @@ entry: ; [ ] Z -> N ; [x] Z -> S ; [ ] Z -> Z -define void @new_za_caller_shared_za_callee_inline() "aarch64_pstate_za_new" { +define void @new_za_caller_shared_za_callee_inline() "aarch64_new_za" { ; CHECK-LABEL: define void @new_za_caller_shared_za_callee_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: @@ -114,7 +114,7 @@ entry: ; [ ] Z -> N ; [ ] Z -> S ; [x] Z -> Z -define void @new_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_new" { +define void @new_za_caller_new_za_callee_dont_inline() "aarch64_new_za" { ; CHECK-LABEL: define void @new_za_caller_new_za_callee_dont_inline ; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: entry: @@ -129,7 +129,7 @@ entry: ; [x] Z -> N ; [ ] Z -> S ; [ ] Z -> Z -define void @shared_za_caller_nonza_callee_inline() "aarch64_pstate_za_shared" { +define void @shared_za_caller_nonza_callee_inline() "aarch64_inout_za" { ; CHECK-LABEL: define void @shared_za_caller_nonza_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: @@ -144,7 +144,7 @@ entry: ; [ ] S -> N ; [x] S -> Z ; [ ] S -> S -define void @shared_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_shared" { +define void @shared_za_caller_new_za_callee_dont_inline() "aarch64_inout_za" { ; CHECK-LABEL: define void @shared_za_caller_new_za_callee_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: @@ -159,7 +159,7 @@ entry: ; [ ] S -> N ; [ ] S -> Z ; [x] S -> S -define void @shared_za_caller_shared_za_callee_inline() "aarch64_pstate_za_shared" { +define void @shared_za_caller_shared_za_callee_inline() "aarch64_inout_za" { ; CHECK-LABEL: define void 
@shared_za_caller_shared_za_callee_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: @@ -181,7 +181,7 @@ define void @private_za_callee_call_za_disable() { ret void } -define void @shared_za_caller_private_za_callee_call_za_disable() "aarch64_pstate_za_shared" { +define void @shared_za_caller_private_za_callee_call_za_disable() "aarch64_inout_za" { ; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_za_disable ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: call void @private_za_callee_call_za_disable() @@ -201,7 +201,7 @@ define void @private_za_callee_call_tpidr2_save() { ret void } -define void @shared_za_caller_private_za_callee_call_tpidr2_save_dont_inline() "aarch64_pstate_za_shared" { +define void @shared_za_caller_private_za_callee_call_tpidr2_save_dont_inline() "aarch64_inout_za" { ; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_tpidr2_save_dont_inline ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: call void @private_za_callee_call_tpidr2_save() @@ -221,7 +221,7 @@ define void @private_za_callee_call_tpidr2_restore(ptr %ptr) { ret void } -define void @shared_za_caller_private_za_callee_call_tpidr2_restore_dont_inline(ptr %ptr) "aarch64_pstate_za_shared" { +define void @shared_za_caller_private_za_callee_call_tpidr2_restore_dont_inline(ptr %ptr) "aarch64_inout_za" { ; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_tpidr2_restore_dont_inline ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: call void @private_za_callee_call_tpidr2_restore(ptr [[PTR]]) diff --git a/llvm/test/Verifier/sme-attributes.ll b/llvm/test/Verifier/sme-attributes.ll index 7f788cfd09f0..3d01613ebf2f 100644 --- a/llvm/test/Verifier/sme-attributes.ll +++ b/llvm/test/Verifier/sme-attributes.ll @@ -3,8 +3,62 @@ declare void @sm_attrs() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_compatible"; ; CHECK: Attributes 'aarch64_pstate_sm_enabled and aarch64_pstate_sm_compatible' are incompatible! -declare void @za_preserved() "aarch64_pstate_za_new" "aarch64_pstate_za_preserved"; -; CHECK: Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_preserved' are incompatible! +declare void @za_new_preserved() "aarch64_new_za" "aarch64_preserves_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive -declare void @za_shared() "aarch64_pstate_za_new" "aarch64_pstate_za_shared"; -; CHECK: Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_shared' are incompatible! 
+declare void @za_new_in() "aarch64_new_za" "aarch64_in_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_new_inout() "aarch64_new_za" "aarch64_inout_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_new_out() "aarch64_new_za" "aarch64_out_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_preserved_in() "aarch64_preserves_za" "aarch64_in_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_preserved_inout() "aarch64_preserves_za" "aarch64_inout_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_preserved_out() "aarch64_preserves_za" "aarch64_out_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_in_inout() "aarch64_in_za" "aarch64_inout_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_in_out() "aarch64_in_za" "aarch64_out_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @za_inout_out() "aarch64_inout_za" "aarch64_out_za"; +; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive + +declare void @zt0_new_preserved() "aarch64_new_zt0" "aarch64_preserves_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_new_in() "aarch64_new_zt0" "aarch64_in_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_new_inout() "aarch64_new_zt0" "aarch64_inout_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_new_out() "aarch64_new_zt0" "aarch64_out_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_preserved_in() "aarch64_preserves_zt0" "aarch64_in_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_preserved_inout() "aarch64_preserves_zt0" "aarch64_inout_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_preserved_out() "aarch64_preserves_zt0" "aarch64_out_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_in_inout() "aarch64_in_zt0" "aarch64_inout_zt0"; +; 
CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_in_out() "aarch64_in_zt0" "aarch64_out_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive + +declare void @zt0_inout_out() "aarch64_inout_zt0" "aarch64_out_zt0"; +; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp index 5ac143b52a25..ab96ee147a9a 100644 --- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp +++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp @@ -38,25 +38,25 @@ TEST(SMEAttributes, Constructors) { ->getFunction("foo")) .hasStreamingCompatibleInterface()); - ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_shared\"") - ->getFunction("foo")) - .hasSharedZAInterface()); - - ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_new\"") + ASSERT_TRUE( + SA(*parseIR("declare void @foo() \"aarch64_in_za\"")->getFunction("foo")) + .isInZA()); + ASSERT_TRUE( + SA(*parseIR("declare void @foo() \"aarch64_out_za\"")->getFunction("foo")) + .isOutZA()); + ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_inout_za\"") ->getFunction("foo")) - .hasNewZABody()); - - ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_preserved\"") + .isInOutZA()); + ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_preserves_za\"") ->getFunction("foo")) - .preservesZA()); + .isPreservesZA()); + ASSERT_TRUE( + SA(*parseIR("declare void @foo() \"aarch64_new_za\"")->getFunction("foo")) + .isNewZA()); // Invalid combinations. EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled | SA::SM_Compatible), "SM_Enabled and SM_Compatible are mutually exclusive"); - EXPECT_DEBUG_DEATH(SA(SA::ZA_New | SA::ZA_Shared), - "ZA_New and ZA_Shared are mutually exclusive"); - EXPECT_DEBUG_DEATH(SA(SA::ZA_New | SA::ZA_Preserved), - "ZA_New and ZA_Preserved are mutually exclusive"); // Test that the set() methods equally check validity. EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled).set(SA::SM_Compatible), @@ -79,22 +79,69 @@ TEST(SMEAttributes, Basics) { ASSERT_TRUE(SA(SA::SM_Compatible | SA::SM_Body).hasStreamingBody()); ASSERT_FALSE(SA(SA::SM_Compatible | SA::SM_Body).hasNonStreamingInterface()); - // Test PSTATE.ZA interfaces. 
-  ASSERT_FALSE(SA(SA::ZA_Shared).hasPrivateZAInterface());
-  ASSERT_TRUE(SA(SA::ZA_Shared).hasSharedZAInterface());
-  ASSERT_TRUE(SA(SA::ZA_Shared).hasZAState());
-  ASSERT_FALSE(SA(SA::ZA_Shared).preservesZA());
-  ASSERT_TRUE(SA(SA::ZA_Shared | SA::ZA_Preserved).preservesZA());
-
-  ASSERT_TRUE(SA(SA::ZA_New).hasPrivateZAInterface());
-  ASSERT_TRUE(SA(SA::ZA_New).hasNewZABody());
-  ASSERT_TRUE(SA(SA::ZA_New).hasZAState());
-  ASSERT_FALSE(SA(SA::ZA_New).preservesZA());
-
-  ASSERT_TRUE(SA(SA::Normal).hasPrivateZAInterface());
-  ASSERT_FALSE(SA(SA::Normal).hasNewZABody());
+  // Test ZA State interfaces
+  SA ZA_In = SA(SA::encodeZAState(SA::StateValue::In));
+  ASSERT_TRUE(ZA_In.isInZA());
+  ASSERT_FALSE(ZA_In.isOutZA());
+  ASSERT_FALSE(ZA_In.isInOutZA());
+  ASSERT_FALSE(ZA_In.isPreservesZA());
+  ASSERT_FALSE(ZA_In.isNewZA());
+  ASSERT_TRUE(ZA_In.sharesZA());
+  ASSERT_TRUE(ZA_In.hasZAState());
+  ASSERT_TRUE(ZA_In.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_In.hasPrivateZAInterface());
+
+  SA ZA_Out = SA(SA::encodeZAState(SA::StateValue::Out));
+  ASSERT_TRUE(ZA_Out.isOutZA());
+  ASSERT_FALSE(ZA_Out.isInZA());
+  ASSERT_FALSE(ZA_Out.isInOutZA());
+  ASSERT_FALSE(ZA_Out.isPreservesZA());
+  ASSERT_FALSE(ZA_Out.isNewZA());
+  ASSERT_TRUE(ZA_Out.sharesZA());
+  ASSERT_TRUE(ZA_Out.hasZAState());
+  ASSERT_TRUE(ZA_Out.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_Out.hasPrivateZAInterface());
+
+  SA ZA_InOut = SA(SA::encodeZAState(SA::StateValue::InOut));
+  ASSERT_TRUE(ZA_InOut.isInOutZA());
+  ASSERT_FALSE(ZA_InOut.isInZA());
+  ASSERT_FALSE(ZA_InOut.isOutZA());
+  ASSERT_FALSE(ZA_InOut.isPreservesZA());
+  ASSERT_FALSE(ZA_InOut.isNewZA());
+  ASSERT_TRUE(ZA_InOut.sharesZA());
+  ASSERT_TRUE(ZA_InOut.hasZAState());
+  ASSERT_TRUE(ZA_InOut.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_InOut.hasPrivateZAInterface());
+
+  SA ZA_Preserved = SA(SA::encodeZAState(SA::StateValue::Preserved));
+  ASSERT_TRUE(ZA_Preserved.isPreservesZA());
+  ASSERT_FALSE(ZA_Preserved.isInZA());
+  ASSERT_FALSE(ZA_Preserved.isOutZA());
+  ASSERT_FALSE(ZA_Preserved.isInOutZA());
+  ASSERT_FALSE(ZA_Preserved.isNewZA());
+  ASSERT_TRUE(ZA_Preserved.sharesZA());
+  ASSERT_TRUE(ZA_Preserved.hasZAState());
+  ASSERT_TRUE(ZA_Preserved.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_Preserved.hasPrivateZAInterface());
+
+  SA ZA_New = SA(SA::encodeZAState(SA::StateValue::New));
+  ASSERT_TRUE(ZA_New.isNewZA());
+  ASSERT_FALSE(ZA_New.isInZA());
+  ASSERT_FALSE(ZA_New.isOutZA());
+  ASSERT_FALSE(ZA_New.isInOutZA());
+  ASSERT_FALSE(ZA_New.isPreservesZA());
+  ASSERT_FALSE(ZA_New.sharesZA());
+  ASSERT_TRUE(ZA_New.hasZAState());
+  ASSERT_FALSE(ZA_New.hasSharedZAInterface());
+  ASSERT_TRUE(ZA_New.hasPrivateZAInterface());
+
+  ASSERT_FALSE(SA(SA::Normal).isInZA());
+  ASSERT_FALSE(SA(SA::Normal).isOutZA());
+  ASSERT_FALSE(SA(SA::Normal).isInOutZA());
+  ASSERT_FALSE(SA(SA::Normal).isPreservesZA());
+  ASSERT_FALSE(SA(SA::Normal).isNewZA());
+  ASSERT_FALSE(SA(SA::Normal).sharesZA());
   ASSERT_FALSE(SA(SA::Normal).hasZAState());
-  ASSERT_FALSE(SA(SA::Normal).preservesZA());
 }
 
 TEST(SMEAttributes, Transitions) {
--
Gitee

From 0c96571d0a06b58cf0a6dde8a6221467f283ea5d Mon Sep 17 00:00:00 2001
From: Matthew Devereau
Date: Fri, 2 Feb 2024 08:12:05 +0000
Subject: [PATCH 65/77] [AArch64][SME] Implement inline-asm clobbers for za/zt0 (#79276)

This enables specifying "za" or "zt0" in the clobber list for inline asm.
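For example, with this change code like the following compiles (a minimal usage sketch, not part of this patch; the function name is hypothetical and an SME-capable target is assumed):

```
// Sketch: tell the compiler that the asm statement clobbers the SME ZA
// array. Clang lowers the "za" clobber to the ~{za} constraint in LLVM IR;
// "zt0" is handled the same way for the ZT0 register.
void clobber_za_example(void) {
  asm volatile("zero {za}" : : : "za"); // SME instruction zeroing all of ZA
}
```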
This complies with the ACLE SME addition to the asm extension here:
https://github.com/ARM-software/acle/pull/276

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 clang/lib/Basic/Targets/AArch64.cpp             |  9 ++++++++-
 clang/test/CodeGen/aarch64-inline-asm.c         |  8 ++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp |  8 ++++++++
 llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp |  4 ++++
 llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll | 16 ++++++++++++++++
 5 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index e64e8bb23d20..ed77694164c8 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -1146,6 +1146,8 @@ TargetInfo::BuiltinVaListKind AArch64TargetInfo::getBuiltinVaListKind() const {
 }
 
 const char *const AArch64TargetInfo::GCCRegNames[] = {
+    // clang-format off
+
     // 32-bit Integer registers
     "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11",
     "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22",
@@ -1182,7 +1184,12 @@ const char *const AArch64TargetInfo::GCCRegNames[] = {
 
     // SVE predicate-as-counter registers
     "pn0", "pn1", "pn2", "pn3", "pn4", "pn5", "pn6", "pn7", "pn8",
-    "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15"
+    "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15",
+
+    // SME registers
+    "za", "zt0",
+
+    // clang-format on
 };
 
 ArrayRef<const char *> AArch64TargetInfo::getGCCRegNames() const {
diff --git a/clang/test/CodeGen/aarch64-inline-asm.c b/clang/test/CodeGen/aarch64-inline-asm.c
index 75e9a8c46b87..8ddee560b11d 100644
--- a/clang/test/CodeGen/aarch64-inline-asm.c
+++ b/clang/test/CodeGen/aarch64-inline-asm.c
@@ -95,3 +95,11 @@ void test_reduced_gpr_constraints(int var32, long var64) {
 // CHECK: [[ARG2:%.+]] = load i64, ptr
 // CHECK: call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 [[ARG2]])
 }
+
+void test_sme_constraints(){
+  asm("movt zt0[3, mul vl], z0" : : : "za");
+// CHECK: call void asm sideeffect "movt zt0[3, mul vl], z0", "~{za}"()
+
+  asm("movt zt0[3, mul vl], z0" : : : "zt0");
+// CHECK: call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"()
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 79ce886394ce..1955ba39bbe9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10364,6 +10364,14 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
       parseConstraintCode(Constraint) != AArch64CC::Invalid)
     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
 
+  if (Constraint == "{za}") {
+    return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
+  }
+
+  if (Constraint == "{zt0}") {
+    return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
+  }
+
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
std::pair Res; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 7372dbf7ec86..b36820831f49 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -491,6 +491,10 @@ bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, MCRegisterInfo::regsOverlap(PhysReg, AArch64::X16)) return true; + // ZA/ZT0 registers are reserved but may be permitted in the clobber list. + if (PhysReg == AArch64::ZA || PhysReg == AArch64::ZT0) + return true; + return !isReservedReg(MF, PhysReg); } diff --git a/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll new file mode 100644 index 000000000000..a8cba7dc9a91 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-linux-gnu -stop-after=aarch64-isel < %s -o - | FileCheck %s + +define void @alpha( %x) local_unnamed_addr { +entry: +; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $za + tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{za}"() + ret void +} + +define void @beta( %x) local_unnamed_addr { +entry: +; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $zt0 + tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"() + ret void +} -- Gitee From dd8d8d894eb3ef1f3f48846c4fb8a6c927354172 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 2 Feb 2024 09:29:47 +0000 Subject: [PATCH 66/77] [Clang][AArch64] Add missing SME macros (#80293) __ARM_STATE_ZA and __ARM_STATE_ZT0 are set when the compiler can parse the "za" and "zt0" strings in the SME attributes. __ARM_FEATURE_SME and __ARM_FEATURE_SME2 are set when the compiler can generate code for attributes with "za" and "zt0" state, respectively. __ARM_FEATURE_LOCALLY_STREAMING is set when the compiler supports the __arm_locally_streaming attribute. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/lib/Basic/Targets/AArch64.cpp | 23 +++++++++++++++++++ clang/lib/Basic/Targets/AArch64.h | 1 + .../Preprocessor/aarch64-target-features.c | 13 +++++++++++ clang/test/Preprocessor/init-aarch64.c | 2 ++ 4 files changed, 39 insertions(+) diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index ed77694164c8..619677c1c1a4 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -374,6 +374,11 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__ARM_ALIGN_MAX_STACK_PWR", "4"); + // These macros are set when Clang can parse declarations with these + // attributes. + Builder.defineMacro("__ARM_STATE_ZA", "1"); + Builder.defineMacro("__ARM_STATE_ZT0", "1"); + // 0xe implies support for half, single and double precision operations. 
if (FPU & FPUMode) Builder.defineMacro("__ARM_FP", "0xE"); @@ -418,6 +423,17 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2 && HasSVE2SM4) Builder.defineMacro("__ARM_FEATURE_SVE2_SM4", "1"); + if (HasSME) { + Builder.defineMacro("__ARM_FEATURE_SME"); + Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); + } + + if (HasSME2) { + Builder.defineMacro("__ARM_FEATURE_SME"); + Builder.defineMacro("__ARM_FEATURE_SME2"); + Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); + } + if (HasCRC) Builder.defineMacro("__ARM_FEATURE_CRC32", "1"); @@ -670,6 +686,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sme", HasSME) + .Case("sme2", HasSME2) .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Case("sme-fa64", HasSMEFA64) @@ -790,6 +807,12 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasBFloat16 = true; HasFullFP16 = true; } + if (Feature == "+sme2") { + HasSME = true; + HasSME2 = true; + HasBFloat16 = true; + HasFullFP16 = true; + } if (Feature == "+sme-f64f64") { HasSME = true; HasSMEF64F64 = true; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 76d71aca76bd..77040949fcbe 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -68,6 +68,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasCCDP = false; bool HasFRInt3264 = false; bool HasSME = false; + bool HasSME2 = false; bool HasSMEF64F64 = false; bool HasSMEI16I64 = false; bool HasSB = false; diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 7f2b353ab18c..703294ef13bb 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -58,6 +58,10 @@ // CHECK-NOT: __ARM_FEATURE_SVE_BITS 512 // CHECK-NOT: __ARM_FEATURE_SVE_BITS 1024 // CHECK-NOT: __ARM_FEATURE_SVE_BITS 2048 +// CHECK: __ARM_STATE_ZA 1 +// CHECK: __ARM_STATE_ZT0 1 +// CHECK-NOT: __ARM_FEATURE_SME +// CHECK-NOT: __ARM_FEATURE_SME2 // RUN: %clang -target aarch64-none-elf -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE // RUN: %clang -target arm64-none-linux-gnu -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE @@ -616,3 +620,12 @@ // RUN: %clang --target=aarch64 -march=armv8.2-a+rcpc3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-RCPC3 %s // CHECK-RCPC3: __ARM_FEATURE_RCPC 3 + +// RUN: %clang --target=aarch64 -march=armv9-a+sme -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME %s +// CHECK-SME: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME: __ARM_FEATURE_SME 1 +// +// RUN: %clang --target=aarch64 -march=armv9-a+sme2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2 %s +// CHECK-SME2: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME2: __ARM_FEATURE_SME 1 +// CHECK-SME2: __ARM_FEATURE_SME2 1 diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 2b7cc57f2303..6cc9cb22c7af 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -32,6 +32,8 @@ // AARCH64-NEXT: #define __ARM_PCS_AAPCS64 1 // AARCH64-NEXT: #define __ARM_SIZEOF_MINIMAL_ENUM 4 // AARCH64-NEXT: #define __ARM_SIZEOF_WCHAR_T 4 +// AARCH64-NEXT: #define __ARM_STATE_ZA 1 +// AARCH64-NEXT: #define __ARM_STATE_ZT0 1 // 
AARCH64-NEXT: #define __ATOMIC_ACQUIRE 2 // AARCH64-NEXT: #define __ATOMIC_ACQ_REL 4 // AARCH64-NEXT: #define __ATOMIC_CONSUME 1 -- Gitee From 6a0305e3db4f731f18dacc4b6af52af8677774d4 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 6 Feb 2024 10:42:44 +0000 Subject: [PATCH 67/77] [Clang][AArch64] Fix some target guards and remove +sve from tests. (#80681) The TargetGuard fields for 'svldr[_vnum]_za' and 'svstr[_vnum]_za' were incorrectly set to `+sve` instead of `+sme`. This means that compiling code that uses these intrinsics requires compiling for both `+sve` as well as `+sme`. This PR also fixes the target guards for the `svadd` and `svsub` builtins that are enabled under `+sme2,+sme-i16i64` and `+sme2,+sme-f64f64`, as it initially did the following: ``` let TargetGuard = "+sme2" in { let TargetGuard = "+sme-i16i64" in { // Builtins defined here will be predicated only by // '+sme-i16i64', and not '+sme2,+sme-i16i64'. } } ``` This PR also removes `-target-feature +sve` from all the SME tests, to ensure that the SME features are sufficient to build the tests. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- clang/include/clang/Basic/arm_sme.td | 28 +++++++++++-------- .../aarch64-sme-intrinsics/acle_sme_add-i32.c | 10 +++---- .../aarch64-sme-intrinsics/acle_sme_add-i64.c | 10 +++---- .../aarch64-sme-intrinsics/acle_sme_cnt.c | 6 ++-- .../aarch64-sme-intrinsics/acle_sme_ld1.c | 6 ++-- .../acle_sme_ld1_vnum.c | 6 ++-- .../aarch64-sme-intrinsics/acle_sme_ldr.c | 6 ++-- .../acle_sme_mopa-za32.c | 10 +++---- .../acle_sme_mopa-za64.c | 10 +++---- .../acle_sme_mops-za32.c | 10 +++---- .../acle_sme_mops-za64.c | 10 +++---- .../aarch64-sme-intrinsics/acle_sme_read.c | 10 +++---- .../aarch64-sme-intrinsics/acle_sme_st1.c | 6 ++-- .../acle_sme_st1_vnum.c | 6 ++-- .../aarch64-sme-intrinsics/acle_sme_str.c | 6 ++-- .../aarch64-sme-intrinsics/acle_sme_write.c | 10 +++---- .../aarch64-sme-intrinsics/acle_sme_zero.c | 6 ++-- 17 files changed, 80 insertions(+), 76 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 27dde7e84e96..dc998d1f5668 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -44,6 +44,7 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0 defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; +let TargetGuard = "sme" in { def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr">; @@ -51,6 +52,7 @@ def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr", []>; +} //////////////////////////////////////////////////////////////////////////////// // Stores @@ -81,6 +83,7 @@ defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>; defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; +let TargetGuard = "sme" in { def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str">; @@ -88,6 +91,7 @@ def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", def 
SVSTR_ZA : MInst<"svstr_za", "vm%", "", [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str", []>; +} //////////////////////////////////////////////////////////////////////////////// // Read horizontal/vertical ZA slices @@ -277,22 +281,22 @@ multiclass ZAAddSub { def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + } - let TargetGuard = "sme-i16i64" in { - def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; + let TargetGuard = "sme2,sme-i16i64" in { + def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; - } + def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + } - let TargetGuard = "sme-f64f64" in { - def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; - } + let TargetGuard = "sme2,sme-f64f64" in { + def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", 
"vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index 89aef63bc34b..ce951f4423b6 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c index a5e04ff9071c..d15a068fa640 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 
-Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c index 5e5f3108bccd..c80c8bd37e95 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index 37812bba4969..6fbbadc7fc8c 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 29357055bb97..37e0a5853e77 100644 --- 
a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index 627f770d3ce4..bf7cffc1b083 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c index e6f8fbefa34f..bceb2d6f83df 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c index c55ff1cc5e99..7cdeb8731d0b 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 
-target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c index aa853e43dac8..51fffd42b944 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c index e8a54c1fbb54..24141cd1f20b 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature 
+sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c index f11e1a87cf33..2974511e4ebb 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index 
01681eab04b5..6b0dd67a16fb 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c index 9c21c7b14a56..be79674ea684 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index 0133a3d9a1cb..caac3ea6596f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c index 64e7c10e3022..c76be10518c0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index 2c908b5611dd..72db124537bd 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,7 +1,7 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s #include -- Gitee From 079b5b6dfa5f7ab84b08698114d045c23ed36522 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 22 Feb 2024 13:07:31 +0000 Subject: [PATCH 68/77] [Clang][SME] Detect always_inline used with 
mismatched streaming attributes (#77936)

This patch adds an error that is emitted when a streaming function is
marked as always_inline and is called from a non-streaming function.

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../clang/Basic/DiagnosticFrontendKinds.td    |  4 ++
 clang/lib/CodeGen/Targets/AArch64.cpp         | 43 +++++++++++++++++
 .../aarch64-sme-inline-streaming-attrs.c      | 47 +++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c

diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 11022962ae9e..db2428caa5c8 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -273,6 +273,10 @@ def err_builtin_needs_feature : Error<"%0 needs target feature %1">;
 def err_function_needs_feature : Error<
   "always_inline function %1 requires target feature '%2', but would "
   "be inlined into function %0 that is compiled without support for '%2'">;
+def err_function_always_inline_attribute_mismatch : Error<
+  "always_inline function %1 and its caller %0 have mismatching %2 attributes">;
+def err_function_always_inline_new_za : Error<
+  "always_inline function %0 has new za state">;

 def warn_avx_calling_convention
     : Warning<"AVX vector %select{return|argument}0 of type %1 without '%2' "
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 561110ff8c0d..cca1aea5dc67 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -8,6 +8,7 @@
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/Basic/DiagnosticFrontend.h"

 using namespace clang;
 using namespace clang::CodeGen;
@@ -151,6 +152,11 @@ public:
     }
     return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
   }
+
+  void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
+                            const FunctionDecl *Caller,
+                            const FunctionDecl *Callee,
+                            const CallArgList &Args) const override;
 };

 class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {
@@ -811,6 +817,43 @@ Address AArch64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
                           /*allowHigherAlign*/ false);
 }

+static bool isStreaming(const FunctionDecl *F) {
+  if (F->hasAttr<ArmLocallyStreamingAttr>())
+    return true;
+  if (const auto *T = F->getType()->getAs<FunctionProtoType>())
+    return T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask;
+  return false;
+}
+
+static bool isStreamingCompatible(const FunctionDecl *F) {
+  if (const auto *T = F->getType()->getAs<FunctionProtoType>())
+    return T->getAArch64SMEAttributes() &
+           FunctionType::SME_PStateSMCompatibleMask;
+  return false;
+}
+
+void AArch64TargetCodeGenInfo::checkFunctionCallABI(
+    CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
+    const FunctionDecl *Callee, const CallArgList &Args) const {
+  if (!Caller || !Callee || !Callee->hasAttr<AlwaysInlineAttr>())
+    return;
+
+  bool CallerIsStreaming = isStreaming(Caller);
+  bool CalleeIsStreaming = isStreaming(Callee);
+  bool CallerIsStreamingCompatible = isStreamingCompatible(Caller);
+  bool CalleeIsStreamingCompatible = isStreamingCompatible(Callee);
+
+  if (!CalleeIsStreamingCompatible &&
+      (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible))
+    CGM.getDiags().Report(CallLoc,
+                          diag::err_function_always_inline_attribute_mismatch)
+        << Caller->getDeclName() << Callee->getDeclName() << "streaming";
+  if (auto *NewAttr = Callee->getAttr<ArmNewAttr>())
+    if (NewAttr->isNewZA())
+      CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_za)
+          << Callee->getDeclName();
+}
+
 std::unique_ptr<TargetCodeGenInfo>
 CodeGen::createAArch64TargetCodeGenInfo(CodeGenModule &CGM,
                                         AArch64ABIKind Kind) {
diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
new file mode 100644
index 000000000000..7eb74f28a1c8
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_NONE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_COMPATIBLE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_STREAMING %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_LOCALLY %s
+
+#define __ai __attribute__((always_inline))
+__ai void inlined_fn(void) {}
+__ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {}
+__ai void inlined_fn_streaming(void) __arm_streaming {}
+__ai __arm_locally_streaming void inlined_fn_local(void) {}
+
+#ifdef TEST_NONE
+void caller(void) {
+  inlined_fn();
+  inlined_fn_streaming_compatible();
+  inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller' have mismatching streaming attributes}}
+  inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller' have mismatching streaming attributes}}
+}
+#endif
+
+#ifdef TEST_COMPATIBLE
+void caller_compatible(void) __arm_streaming_compatible {
+  inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes}}
+  inlined_fn_streaming_compatible();
+  inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller_compatible' have mismatching streaming attributes}}
+  inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller_compatible' have mismatching streaming attributes}}
+}
+#endif
+
+#ifdef TEST_STREAMING
+void caller_streaming(void) __arm_streaming {
+  inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes}}
+  inlined_fn_streaming_compatible();
+  inlined_fn_streaming();
+  inlined_fn_local();
+}
+#endif
+
+#ifdef TEST_LOCALLY
+__arm_locally_streaming
+void caller_local(void) {
+  inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes}}
+  inlined_fn_streaming_compatible();
+  inlined_fn_streaming();
+  inlined_fn_local();
+}
+#endif
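To illustrate the user-facing effect of the diagnostic above: an always_inline
callee whose streaming mode differs from its caller's is now rejected, while a
streaming-compatible callee can still be inlined from either mode. A minimal
editorial sketch, not part of the patch; it assumes an SME-enabled compile
(e.g. -target-feature +sme) so Clang's SME attribute keywords are accepted:

    #define __ai __attribute__((always_inline))
    __ai void callee_streaming(void) __arm_streaming {}           // streaming body
    __ai void callee_compat(void) __arm_streaming_compatible {}   // valid in both modes

    void non_streaming_caller(void) {
      callee_streaming(); // error: always_inline function 'callee_streaming' and its
                          // caller 'non_streaming_caller' have mismatching streaming attributes
      callee_compat();    // OK: streaming-compatible code may be inlined anywhere
    }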
--
Gitee

From f2bab3a2f024f17dcc2e5d13bc2447fc1d0c24db Mon Sep 17 00:00:00 2001
From: CarolineConcatto
Date: Thu, 22 Feb 2024 09:19:48 +0000
Subject: [PATCH 69/77] [AArch64] Restore Z-registers before P-registers
 (#79623) (#82492)

This is needed by PR#77665[1], which uses a P-register while restoring
Z-registers.

The reversed order of the SVE register restores in the epilogue was
originally added to guarantee performance, but further work has since
improved the SVE frame restore, and the scheduler may reorder the
restores anyway, undoing the reversal. This also fixes the problem
reported in PR #79623 on Windows with std::reverse and .base().

[1]https://github.com/llvm/llvm-project/pull/77665

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 19 ++--
 .../framelayout-sve-calleesaves-fix.mir       |  2 +-
 llvm/test/CodeGen/AArch64/framelayout-sve.mir | 24 ++---
 .../sme-streaming-compatible-interface.ll     | 48 +++++-----
 .../AArch64/sme-streaming-interface.ll        | 48 +++++-----
 llvm/test/CodeGen/AArch64/sve-alloca.ll       | 24 ++---
 .../AArch64/sve-calling-convention-mixed.ll   | 48 +++++-----
 llvm/test/CodeGen/AArch64/sve-tailcall.ll     | 48 +++++-----
 llvm/test/CodeGen/AArch64/unwind-preserved.ll | 96 +++++++++----------
 9 files changed, 179 insertions(+), 178 deletions(-)
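The AArch64FrameLowering change below drops the blanket "restore scalable
registers in reverse" loop and instead reverses, in place, the contiguous PPR
run and the contiguous ZPR run inside RegPairs, so each class is reloaded in
increasing slot order and the Z-register reloads are emitted before the
P-register reloads. As a standalone illustration of the find_if /
find_if_not / std::reverse idiom it uses (an editorial sketch, not from the
patch; the vector contents are made up):

    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main() {
      // 30..32 stand in for the contiguous run of ZPR entries in RegPairs.
      std::vector<int> Pairs = {1, 2, 30, 31, 32, 7};
      auto IsZPR = [](int P) { return P >= 30 && P < 40; };
      auto ZPRBegin = std::find_if(Pairs.begin(), Pairs.end(), IsZPR);
      auto ZPREnd = std::find_if_not(ZPRBegin, Pairs.end(), IsZPR);
      std::reverse(ZPRBegin, ZPREnd); // only the run flips: 1 2 32 31 30 7
      for (int P : Pairs)
        std::cout << P << ' ';
      std::cout << '\n';
    }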
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 83af97f8c8c7..e8071bd7171e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2978,11 +2978,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     return MIB->getIterator();
   };

-  // SVE objects are always restored in reverse order.
-  for (const RegPairInfo &RPI : reverse(RegPairs))
-    if (RPI.isScalable())
-      EmitMI(RPI);
-
   if (homogeneousPrologEpilog(MF, &MBB)) {
     auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
                    .setMIFlag(MachineInstr::FrameDestroy);
@@ -2993,11 +2988,19 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     return true;
   }

+  // For performance reasons, restore the SVE registers in increasing order.
+  auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
+  auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+  auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR);
+  std::reverse(PPRBegin, PPREnd);
+  auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
+  auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+  auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
+  std::reverse(ZPRBegin, ZPREnd);
+
   if (ReverseCSRRestoreSeq) {
     MachineBasicBlock::iterator First = MBB.end();
     for (const RegPairInfo &RPI : reverse(RegPairs)) {
-      if (RPI.isScalable())
-        continue;
       MachineBasicBlock::iterator It = EmitMI(RPI);
       if (First == MBB.end())
         First = It;
@@ -3006,8 +3009,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     MBB.splice(MBBI, &MBB, First);
   } else {
     for (const RegPairInfo &RPI : RegPairs) {
-      if (RPI.isScalable())
-        continue;
       (void)EmitMI(RPI);
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
index 7d7b3ace8a91..8725bb7061fa 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
@@ -19,8 +19,8 @@
   ; CHECK-NEXT: // implicit-def: $p4
   ; CHECK-NEXT: addvl sp, sp, #1
   ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-  ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
   ; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+  ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
   ; CHECK-NEXT: addvl sp, sp, #2
   ; CHECK-NEXT: .cfi_def_cfa wsp, 16
   ; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 7c87587c6dc4..eae547fc7033 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++
b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -771,9 +771,9 @@ body: | # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 # CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -872,14 +872,14 @@ body: | # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 -# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 -# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 -# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 # CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3 # CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 # CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17 +# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 +# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 +# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 +# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -1036,14 +1036,14 @@ body: | # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 +# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 +# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 # CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15 -# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 -# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 -# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10 @@ -1197,10 +1197,10 @@ body: | # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 -# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1 # CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 +# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 6165ccb14aae..fb3cdbb39865 100644 --- 
a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -226,18 +226,6 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( @streaming_compatible_with_predicate_vectors( @smstart_clobber_sve( %x) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -215,6 +203,18 @@ define @smstart_clobber_sve( %x) #0 { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -267,18 +267,6 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded 
Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -295,6 +283,18 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll index 90eed07c242b..565311d8e3f8 100644 --- a/llvm/test/CodeGen/AArch64/sve-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll @@ -66,18 +66,6 @@ define void @foo( %dst, i1 %cond) { ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: bl bar ; CHECK-NEXT: addvl sp, x29, #-18 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -94,6 +82,18 @@ define void @foo( %dst, i1 %cond) { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; 
CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x28, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index a97649523565..c551fcf4da70 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -567,18 +567,6 @@ define @sve_caller_non_sve_callee_high_range( @sve_caller_non_sve_callee_high_range( @sve_ret_caller_non_sve_callee_high_range() { ; CHECK-NEXT: addvl x0, sp, #1 ; CHECK-NEXT: bl non_sve_callee_high_range ; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -687,6 +675,18 @@ define @sve_ret_caller_non_sve_callee_high_range() { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll index 58135e44fa91..4ddf007768fd 100644 --- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll +++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll @@ -83,18 +83,6 @@ define i32 @sve_caller_non_sve_callee( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload 
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -111,6 +99,18 @@ define i32 @sve_caller_non_sve_callee( %arg) nounwind { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -158,18 +158,6 @@ define i32 @sve_caller_non_sve_callee_fastcc( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -186,6 +174,18 @@ define i32 @sve_caller_non_sve_callee_fastcc( %arg) nounwind { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload 
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll index f3c4d217e6fc..822be14faaeb 100644 --- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll +++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll @@ -63,18 +63,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -91,6 +79,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -112,18 +112,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: 
addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -140,6 +128,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -215,18 +215,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -243,6 
+231,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #18 ; GISEL-NEXT: .cfi_def_cfa wsp, 16 ; GISEL-NEXT: .cfi_restore z8 @@ -264,18 +264,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -292,6 +280,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #18 ; GISEL-NEXT: .cfi_def_cfa wsp, 16 ; GISEL-NEXT: .cfi_restore z8 -- Gitee From 
120fd0f7e45b31d9167b80b1c6e6db755ecf73f0 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Thu, 29 Feb 2024 15:35:46 +0000
Subject: [PATCH 70/77] [AArch64] Re-enable rematerialization for
 streaming-mode-changing functions. (#83235)

We can add implicit defs/uses of the 'VG' register to the instructions
to prevent the register allocator from rematerializing values in
between streaming-mode changes, as the def/use of VG will further nail
down the ordering that comes out of ISel. This avoids the heavy-handed
approach of preventing any kind of rematerialization.

While we could add 'VG' as a Use to all SVE instructions, we only
really need to do this for instructions that are rematerializable, as
the smstart/smstop instructions and pseudos act as scheduling barriers,
which is sufficient to prevent other instructions from being scheduled
in between the streaming-mode-changing call sequence. However, we may
revisit this in the future.

Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Signed-off-by: chenmiao
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 16 +++++++++++++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  1 +
 .../Target/AArch64/AArch64RegisterInfo.cpp    |  3 +++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  2 ++
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  2 ++
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 10 ++++++++
 .../AArch64/debug-info-sve-dbg-declare.mir    |  2 +-
 .../AArch64/debug-info-sve-dbg-value.mir      |  2 +-
 .../CodeGen/AArch64/live-debugvalues-sve.mir  |  4 ++--
 .../CodeGen/AArch64/sve-localstackalloc.mir   |  2 +-
 .../AArch64/sve-pfalse-machine-cse.mir        |  6 ++---
 .../AArch64/sve-pseudos-expand-undef.mir      |  4 ++--
 .../AArch64/sve-ptest-removal-cmpeq.mir       | 10 ++++----
 .../AArch64/sve-ptest-removal-whilege.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilegt.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilehi.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilehs.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilele.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilelo.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilels.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilelt.mir     | 24 +++++++++----------
 .../AArch64/sve-ptest-removal-whilerw.mir     | 16 ++++++-------
 .../AArch64/sve-ptest-removal-whilewr.mir     | 16 ++++++-------
 23 files changed, 161 insertions(+), 127 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1955ba39bbe9..e15153c62857 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7223,6 +7223,22 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
         (AArch64::GPR32RegClass.contains(MO.getReg()) ||
          AArch64::GPR64RegClass.contains(MO.getReg())))
       MI.removeOperand(I);
+
+  // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
+  // have nothing to do with VG, were it not that they are used to materialise a
+  // frame-address. If they contain a frame-index to a scalable vector, this
+  // will likely require an ADDVL instruction to materialise the address, thus
+  // reading VG.
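+  // [Editorial annotation, not in the upstream patch] VG reads as the current
+  // vector length in 64-bit granules, and its value can change across a
+  // streaming-mode transition; giving rematerializable instructions an
+  // implicit VG use therefore keeps the register allocator from re-emitting
+  // them on the other side of an smstart/smstop.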
+ const MachineFunction &MF = *MI.getMF(); + if (MF.getInfo()->hasStreamingModeChanges() && + (MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::SUBXri)) { + const MachineOperand &MO = MI.getOperand(1); + if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) == + TargetStackID::ScalableVector) + MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, + /*IsImplicit=*/true)); + } } SDValue AArch64TargetLowering::changeStreamingMode( diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 2d52b7c409b4..b645f16d062a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -2670,6 +2670,7 @@ class AddSubImmShift lsl #0, '01' => lsl #12 let Inst{21-10} = imm{11-0}; let DecoderMethod = "DecodeAddSubImmShift"; + let hasPostISelHook = 1; } class BaseAddSubRegPseudo, Sched<[WriteSys]> { let hasPostISelHook = 1; + let Uses = [VG]; + let Defs = [VG]; } def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)), diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 835e74fdbc64..04ae6d79f384 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -222,6 +222,8 @@ def MSRpstatesvcrImm1 let Inst{8} = imm; let Inst{7-5} = 0b011; // op2 let hasPostISelHook = 1; + let Uses = [VG]; + let Defs = [VG]; } def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a35a3e4f40c5..05115aa252a1 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -365,6 +365,7 @@ class sve_int_ptrue sz8_64, bits<3> opc, string asm, PPRRegOp pprty, let ElementSize = pprty.ElementSize; let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_ptrue opc, string asm, SDPatternOperator op> { @@ -755,6 +756,7 @@ class sve_int_pfalse opc, string asm> let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_pfalse opc, string asm> { @@ -1090,6 +1092,7 @@ class sve_int_count opc, string asm> let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_count opc, string asm, SDPatternOperator op> { @@ -1982,6 +1985,7 @@ class sve_int_dup_mask_imm let DecoderMethod = "DecodeSVELogicalImmInstruction"; let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_dup_mask_imm { @@ -2819,6 +2823,7 @@ class sve_int_arith_vl let Inst{4-0} = Rd; let hasSideEffects = 0; + let Uses = [VG]; } class sve_int_read_vl_a opc2, string asm, bit streaming_sve = 0b0> @@ -2839,6 +2844,7 @@ class sve_int_read_vl_a opc2, string asm, bit streaming_sve = 0b let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } //===----------------------------------------------------------------------===// @@ -4656,6 +4662,7 @@ class sve_int_dup_imm sz8_64, string asm, let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_dup_imm { @@ -4698,6 +4705,7 @@ class sve_int_dup_fpimm sz8_64, Operand fpimmtype, let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_dup_fpimm { @@ -5614,6 +5622,7 @@ class sve_int_index_ii sz8_64, string asm, ZPRRegOp zprty, let hasSideEffects = 0; let isReMaterializable = 1; + 
let Uses = [VG]; } multiclass sve_int_index_ii { @@ -9249,6 +9258,7 @@ class sve2p1_ptrue_pn sz, PNRP8to15RegOp pnrty, SDPatte let Inst{2-0} = PNd; let hasSideEffects = 0; + let Uses = [VG]; } diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir index 22cff36afaf7..015e8e9ba2b6 100644 --- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir +++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir @@ -193,7 +193,7 @@ body: | liveins: $z0, $z1, $p0, $p1, $w0 renamable $p2 = COPY killed $p0 - renamable $p0 = PTRUE_S 31 + renamable $p0 = PTRUE_S 31, implicit $vg ST1W_IMM killed renamable $z0, renamable $p0, %stack.0.z0.addr, 0 :: (store unknown-size into %ir.z0.addr, align 16) ST1W_IMM killed renamable $z1, renamable $p0, %stack.1.z1.addr, 0 :: (store unknown-size into %ir.z1.addr, align 16) STR_PXI killed renamable $p2, %stack.2.p0.addr, 0 :: (store unknown-size into %ir.p0.addr, align 2) diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir index 75917ef32ae2..0ea180b20730 100644 --- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir +++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir @@ -111,7 +111,7 @@ body: | STRXui killed renamable $x1, %stack.1, 0, debug-location !8 DBG_VALUE %stack.1, $noreg, !11, !DIExpression(DW_OP_constu, 16, DW_OP_plus, DW_OP_deref), debug-location !8 - renamable $p2 = PTRUE_S 31, debug-location !DILocation(line: 4, column: 1, scope: !5) + renamable $p2 = PTRUE_S 31, implicit $vg, debug-location !DILocation(line: 4, column: 1, scope: !5) ST1W_IMM renamable $z0, renamable $p2, %stack.2, 0, debug-location !DILocation(line: 5, column: 1, scope: !5) DBG_VALUE %stack.2, $noreg, !12, !DIExpression(DW_OP_deref), debug-location !DILocation(line: 5, column: 1, scope: !5) ST1W_IMM renamable $z1, killed renamable $p2, %stack.3, 0, debug-location !DILocation(line: 6, column: 1, scope: !5) diff --git a/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir b/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir index 8903ca2b865b..612453ab53f4 100644 --- a/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir +++ b/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir @@ -145,7 +145,7 @@ body: | liveins: $z1 ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp, debug-location !34 - renamable $p0 = PTRUE_S 31, debug-location !34 + renamable $p0 = PTRUE_S 31, implicit $vg, debug-location !34 $x0 = ADDXri %stack.0, 0, 0, debug-location !34 ST1W_IMM renamable $z1, killed renamable $p0, %stack.0, 0, debug-location !34 :: (store unknown-size into %stack.0, align 16) $z0 = COPY renamable $z1, debug-location !34 @@ -157,7 +157,7 @@ body: | $z7 = COPY renamable $z1, debug-location !34 BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7, implicit $x0, implicit-def $sp, implicit-def $z0, implicit-def $z1, debug-location !34 ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp, debug-location !34 - renamable $p0 = PTRUE_S 31, debug-location !34 + renamable $p0 = PTRUE_S 31, implicit $vg, debug-location !34 $z3 = IMPLICIT_DEF renamable $z1 = LD1W_IMM renamable $p0, %stack.0, 0, debug-location !34 :: (load unknown-size from %stack.0, align 16) ST1W_IMM renamable $z3, killed renamable $p0, %stack.0, 0 :: (store unknown-size into %stack.0, align 16) diff --git 
a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir index 3fbb7889c8b7..6063c8dfc792 100644 --- a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir +++ b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir @@ -48,7 +48,7 @@ body: | %2:gpr32 = COPY $w0 %1:zpr = COPY $z1 %0:zpr = COPY $z0 - %5:ppr_3b = PTRUE_B 31 + %5:ppr_3b = PTRUE_B 31, implicit $vg %6:gpr64sp = ADDXri %stack.0, 0, 0 ST1B_IMM %1, %5, %6, 1 :: (store unknown-size, align 16) ST1B_IMM %0, %5, %stack.0, 0 :: (store unknown-size into %stack.0, align 16) diff --git a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir index b76fe7821b6c..8395a7619fbb 100644 --- a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir +++ b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir @@ -11,15 +11,15 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ppr = COPY $p0 - ; CHECK-NEXT: [[PFALSE:%[0-9]+]]:ppr = PFALSE + ; CHECK-NEXT: [[PFALSE:%[0-9]+]]:ppr = PFALSE implicit $vg ; CHECK-NEXT: [[UZP1_PPP_B:%[0-9]+]]:ppr = UZP1_PPP_B [[COPY]], [[PFALSE]] ; CHECK-NEXT: [[UZP1_PPP_B1:%[0-9]+]]:ppr = UZP1_PPP_B killed [[UZP1_PPP_B]], [[PFALSE]] ; CHECK-NEXT: $p0 = COPY [[UZP1_PPP_B1]] ; CHECK-NEXT: RET_ReallyLR implicit $p0 %0:ppr = COPY $p0 - %2:ppr = PFALSE + %2:ppr = PFALSE implicit $vg %3:ppr = UZP1_PPP_B %0, %2 - %4:ppr = PFALSE + %4:ppr = PFALSE implicit $vg %5:ppr = UZP1_PPP_B killed %3, %4 $p0 = COPY %5 RET_ReallyLR implicit $p0 diff --git a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir index df0e50de4d1a..ae70f91a4ec6 100644 --- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir +++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir @@ -26,7 +26,7 @@ body: | name: expand_mls_to_msb body: | bb.0: - renamable $p0 = PTRUE_B 31 + renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLS_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... @@ -36,7 +36,7 @@ body: | name: expand_mla_to_mad body: | bb.0: - renamable $p0 = PTRUE_B 31 + renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... 
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir index 81318aa5c2a5..5169113697dc 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir @@ -174,7 +174,7 @@ body: | %1:zpr = COPY $z0 %0:ppr_3b = COPY $p0 %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - %3:ppr = PTRUE_B 31 + %3:ppr = PTRUE_B 31, implicit $vg PTEST_PP killed %3, killed %2, implicit-def $nzcv %4:gpr32 = COPY $wzr %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv @@ -409,14 +409,14 @@ body: | ; CHECK-LABEL: name: cmpeq_imm_nxv16i8_ptest_not_all_active ; CHECK: %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - ; CHECK-NEXT: %3:ppr = PTRUE_B 0 + ; CHECK-NEXT: %3:ppr = PTRUE_B 0, implicit $vg ; CHECK-NEXT: PTEST_PP killed %3, killed %2, implicit-def $nzcv ; CHECK-NEXT: %4:gpr32 = COPY $wzr ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:zpr = COPY $z0 %0:ppr_3b = COPY $p0 %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - %3:ppr = PTRUE_B 0 + %3:ppr = PTRUE_B 0, implicit $vg PTEST_PP killed %3, killed %2, implicit-def $nzcv %4:gpr32 = COPY $wzr %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv @@ -446,14 +446,14 @@ body: | ; CHECK-LABEL: name: cmpeq_imm_nxv16i8_ptest_of_halfs ; CHECK: %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - ; CHECK-NEXT: %3:ppr = PTRUE_H 31 + ; CHECK-NEXT: %3:ppr = PTRUE_H 31, implicit $vg ; CHECK-NEXT: PTEST_PP killed %3, killed %2, implicit-def $nzcv ; CHECK-NEXT: %4:gpr32 = COPY $wzr ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:zpr = COPY $z0 %0:ppr_3b = COPY $p0 %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - %3:ppr = PTRUE_H 31 + %3:ppr = PTRUE_H 31, implicit $vg PTEST_PP killed %3, killed %2, implicit-def $nzcv %4:gpr32 = COPY $wzr %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir index 8f7467d99154..c1d9dfff7344 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGE_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGE_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGE_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def 
$nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGE_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGE_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGE_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 0 + %2:ppr = PTRUE_B 0, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir index 217d984560e3..c6df21f85db7 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGT_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGT_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGT_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 =
COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGT_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGT_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGT_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 1 + %2:ppr = PTRUE_H 1, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir index 8d6f466c6b73..7d8aed3c325a 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHI_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHI_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEHI_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr =
WHILEHI_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHI_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHI_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHI_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 29 + %2:ppr = PTRUE_S 29, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir index da76a30f843b..f4dbfbc3db1c 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHS_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHS_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEHS_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: 
PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEHS_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHS_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHS_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 30 + %2:ppr = PTRUE_D 30, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir index 32954d593c1d..dc2265490cb5 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELE_PWW_H %0, %1, implicit-def 
dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELE_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELE_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELE_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELE_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELE_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 7 + %2:ppr = PTRUE_B 7, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir index cca0ab8ef210..4d66e3e57da8 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELO_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELO_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY 
$w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELO_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELO_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELO_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELO_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELO_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 6 + %2:ppr = PTRUE_H 6, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir index 4bae3a1986f4..ea02f8c70ef8 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELS_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELS_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def 
$nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELS_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELS_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELS_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELS_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 5 + %2:ppr = PTRUE_S 5, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir index 3c6a9e21b4c6..d08781f203e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELT_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, 
implicit $vg %3:ppr = WHILELT_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELT_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELT_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELT_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELT_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 4 + %2:ppr = PTRUE_D 4, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir index 27cdf593df77..d800009b9537 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -65,7 +65,7 @@ body: 
| ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILERW_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -100,7 +100,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILERW_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -135,7 +135,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILERW_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -175,7 +175,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 0 + %2:ppr = PTRUE_B 0, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -215,7 +215,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -255,7 +255,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -295,7 +295,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir index 3b49b1ec2c80..9f8b7c3197ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -65,7 +65,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEWR_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -100,7 +100,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEWR_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -135,7 +135,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEWR_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -175,7 +175,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 0 + %2:ppr = PTRUE_B 0, implicit $vg 
%3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -215,7 +215,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -255,7 +255,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -295,7 +295,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr -- Gitee From 4aa36974db0c0e128444abfcf58db514d8e23f3f Mon Sep 17 00:00:00 2001 From: Dani Date: Tue, 12 Mar 2024 12:36:05 +0100 Subject: [PATCH 71/77] [AArch64][SME] Add BTI and No Exec Stack markers to sme-abi.S (#84895) Add BTI landing pads so that compiler-rt can be built with -mbranch-protection. Tabs are changed to 2 spaces for consistency. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- compiler-rt/lib/builtins/aarch64/sme-abi.S | 34 ++++++++++++++-------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S index d470ecaf7aaa..4c0ff66931db 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-abi.S +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -26,9 +26,10 @@ // abort(). Note that there is no need to preserve any state before the call, // because the function does not return. DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) -.cfi_startproc - .variant_pcs SYMBOL_NAME(do_abort) - stp x29, x30, [sp, #-32]! + .cfi_startproc + .variant_pcs SYMBOL_NAME(do_abort) + BTI_C + stp x29, x30, [sp, #-32]! cntd x0 // Store VG to a stack location that we describe with .cfi_offset str x0, [sp, #16] @@ -36,22 +37,23 @@ DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) .cfi_offset w30, -24 .cfi_offset w29, -32 .cfi_offset 46, -16 - bl __arm_sme_state - tbz x0, #0, 2f 1: - smstop sm 2: // We can't make this into a tail-call because the unwinder would // need to restore the value of VG. - bl SYMBOL_NAME(abort) -.cfi_endproc + bl __arm_sme_state + tbz x0, #0, 2f 1: + smstop sm 2: // We can't make this into a tail-call because the unwinder would // need to restore the value of VG. + bl SYMBOL_NAME(abort) + .cfi_endproc END_COMPILERRT_FUNCTION(do_abort) // __arm_sme_state fills the result registers based on a local // that is set as part of the compiler-rt startup code. // __aarch64_has_sme_and_tpidr2_el0 DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) - .variant_pcs __arm_sme_state + .variant_pcs __arm_sme_state + BTI_C mov x0, xzr mov x1, xzr @@ -68,7 +70,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) - .variant_pcs __arm_tpidr2_restore + .variant_pcs __arm_tpidr2_restore + BTI_C // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific // manner.
mrs x14, TPIDR2_EL0 @@ -103,7 +106,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) - .variant_pcs __arm_tpidr2_restore + .variant_pcs __arm_tpidr2_restore + BTI_C // If the current thread does not have access to TPIDR2_EL0, the subroutine // does nothing. adrp x14, TPIDR2_SYMBOL @@ -143,7 +147,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) - .variant_pcs __arm_tpidr2_restore + .variant_pcs __arm_tpidr2_restore + BTI_C // If the current thread does not have access to SME, the subroutine does // nothing. adrp x14, TPIDR2_SYMBOL @@ -174,3 +179,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) 0: ret END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable) + +NO_EXEC_STACK_DIRECTIVE + +// GNU property note for BTI and PAC +GNU_PROPERTY_BTI_PAC -- Gitee From ef56f6e64aa9c5a3e6fbdd3c4a51421c8d23f64c Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 13 Mar 2024 08:21:33 +0000 Subject: [PATCH 72/77] [AArch64][SME] Don't mark 'smstart za' as using/defining VG. (#84775) VG is only used/defined when changing the streaming mode, using 'smstart sm' or plainly 'smstart' (same for smstop). Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 12 +++++++++- llvm/lib/Target/AArch64/SMEInstrFormats.td | 2 -- llvm/test/CodeGen/AArch64/sme-write-vg.ll | 24 +++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-write-vg.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e15153c62857..462991011456 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7216,7 +7216,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // register allocator to pass call args in callee saved regs, without extra // copies to avoid these fake clobbers of actually-preserved GPRs. if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 || - MI.getOpcode() == AArch64::MSRpstatePseudo) + MI.getOpcode() == AArch64::MSRpstatePseudo) { for (unsigned I = MI.getNumOperands() - 1; I > 0; --I) if (MachineOperand &MO = MI.getOperand(I); MO.isReg() && MO.isImplicit() && MO.isDef() && @@ -7224,6 +7224,16 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, AArch64::GPR64RegClass.contains(MO.getReg()))) MI.removeOperand(I); + // The SVE vector length can change when entering/leaving streaming mode. + if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM || + MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) { + MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, + /*IsImplicit=*/true)); + MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true, + /*IsImplicit=*/true)); + } + } + // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that // have nothing to do with VG, were it not that they are used to materialise a // frame-address. 
If they contain a frame-index to a scalable vector, this diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 04ae6d79f384..835e74fdbc64 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -222,8 +222,6 @@ def MSRpstatesvcrImm1 let Inst{8} = imm; let Inst{7-5} = 0b011; // op2 let hasPostISelHook = 1; - let Uses = [VG]; - let Defs = [VG]; } def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; diff --git a/llvm/test/CodeGen/AArch64/sme-write-vg.ll b/llvm/test/CodeGen/AArch64/sme-write-vg.ll new file mode 100644 index 000000000000..577606d45484 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-write-vg.ll @@ -0,0 +1,24 @@ +; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s + +target triple = "aarch64" + +; Check that we don't define VG for 'smstart za' and 'smstop za' +define void @smstart_za() "aarch64_new_za" nounwind { + ; CHECK-LABEL: name: smstart_za + ; CHECK-NOT: implicit-def {{[^,]*}}$vg + ret void +} + +; Check that we do define VG for 'smstart sm' and 'smstop sm' +define void @smstart_sm() nounwind { + ; CHECK-LABEL: name: smstart_sm + ; CHECK: MSRpstatesvcrImm1 1, 1, + ; CHECK-SAME: implicit-def {{[^,]*}}$vg + ; CHECK: MSRpstatesvcrImm1 1, 0, + ; CHECK-SAME: implicit-def {{[^,]*}}$vg + call void @require_sm() + ret void +} + +declare void @require_sm() "aarch64_pstate_sm_enabled" +declare void @require_za() "aarch64_inout_za" -- Gitee From b53f09516af74009be167eb65dbd1209bbafa5cf Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 18 Mar 2024 09:43:03 +0000 Subject: [PATCH 73/77] [AArch64][SME] Make coalescer barrier available without +sme. (#85311) For each call that changes the streaming mode, ISel inserts a COALESCER_BARRIER node for the FP and (non-scalable) vector arguments to the callee. When calling a non-streaming function from a streaming-compatible function, it's not required to have +sme (in case the SME code-path is not actually executed at runtime). The patterns to match the COALESCER_BARRIER, however, were still predicated on `HasSME`, which is incorrect. This patch fixes that. Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 4 +- ...compatible-to-normal-fn-wihout-sme-attr.ll | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index b350638c9ef3..1bab548e424f 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -186,6 +186,8 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>; +} // End let Predicates = [HasSME] + multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> { def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> { let Constraints = "$dst = $src"; @@ -205,8 +207,6 @@ multiclass CoalescerBarriers { defm COALESCER_BARRIER : CoalescerBarriers; -} // End let Predicates = [HasSME] - // Pseudo to match to smstart/smstop.
This expands: // // pseudonode (pstate_za|pstate_sm), before_call, expected_value diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll index 3fa1ee5b9b01..dba3227459b9 100644 --- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll +++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll @@ -38,4 +38,43 @@ define void @streaming_compatible() #0 { declare void @non_streaming() + +; Verify that COALESCER_BARRIER is also supported without +sme. + +define void @streaming_compatible_arg(float %f) #0 { +; CHECK-LABEL: streaming_compatible_arg: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: tbz w19, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl non_streaming +; CHECK-NEXT: tbz w19, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + call void @non_streaming(float %f) + ret void +} + + attributes #0 = { nounwind "aarch64_pstate_sm_compatible" } -- Gitee From 881ec5d3d21de19e0b8355dd5f2eee3210bf01fa Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 26 Mar 2024 11:40:31 +0000 Subject: [PATCH 74/77] [AArch64][SME] Add coalescer barrier for args/results in locally streaming functions. (#85388) Similar to how we protected FP/fixed-vector arguments and results from calls, we should do the same for arguments/results from locally-streaming functions such that those are not spilled/filled as ZPR registers. This may cause a small regression (additional spills/fills), which is addressed by #85386. 
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com> Signed-off-by: chenmiao --- .../Target/AArch64/AArch64ISelLowering.cpp | 21 ++- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +- .../sme-avoid-coalescing-locally-streaming.ll | 120 ++++++++++++++++++ ...ing-body-streaming-compatible-interface.ll | 26 ++-- .../CodeGen/AArch64/sme-streaming-body.ll | 10 +- 5 files changed, 158 insertions(+), 21 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 462991011456..b7c857b72a54 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6369,6 +6369,11 @@ AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, return TPIDR2Obj; } +static bool isPassedInFPR(EVT VT) { + return VT.isFixedLengthVector() || + (VT.isFloatingPoint() && !VT.isScalableVector()); +} + SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, @@ -6503,6 +6508,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // This will be the new Chain/Root node. ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue); Glue = ArgValue.getValue(2); + if (isPassedInFPR(ArgValue.getValueType())) { + ArgValue = + DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + DAG.getVTList(ArgValue.getValueType(), MVT::Glue), + {ArgValue, Glue}); + Glue = ArgValue.getValue(1); + } } else ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); @@ -6869,11 +6881,6 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, } } -static bool isPassedInFPR(EVT VT) { - return VT.isFixedLengthVector() || - (VT.isFloatingPoint() && !VT.isScalableVector()); -} - /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( @@ -8030,6 +8037,10 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<SDValue, 4> RetOps(1, Chain); for (auto &RetVal : RetVals) { + if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() && + isPassedInFPR(RetVal.second.getValueType())) + RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + RetVal.second.getValueType(), RetVal.second); Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue); Glue = Chain.getValue(1); RetOps.push_back( diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 1bab548e424f..5b4070aaeea9 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -23,7 +23,7 @@ def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA", SDTypeProfile<0, 3, [SDNPHasChain, SDNPSideEffect, SDNPVariadic, SDNPOptInGlue]>; def AArch64CoalescerBarrier - : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>; + : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; //===----------------------------------------------------------------------===// // Instruction naming conventions.
diff --git a/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll new file mode 100644 index 000000000000..cd5046a9a647 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK-COALESCER-BARRIER +; RUN: llc -mattr=+sme -stop-after=virtregrewriter < %s | FileCheck %s --check-prefix=CHECK-REGALLOC + +target triple = "aarch64" + +define void @dont_coalesce_args(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { + ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_args + ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0): + ; CHECK-COALESCER-BARRIER-NEXT: liveins: $q0 + ; CHECK-COALESCER-BARRIER-NEXT: {{ $}} + ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]] + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF + ; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: $z0 = COPY [[INSERT_SUBREG]] + ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR + ; + ; CHECK-REGALLOC-LABEL: name: dont_coalesce_args + ; CHECK-REGALLOC: bb.0 (%ir-block.0): + ; CHECK-REGALLOC-NEXT: liveins: $q0 + ; CHECK-REGALLOC-NEXT: {{ $}} + ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0 + ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0) + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) + ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0 + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: RET_ReallyLR + %sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0) + call void @scalable_args(<vscale x 2 x i64> %sa) + ret void +} + +define <2 x i64> @dont_coalesce_res() "aarch64_pstate_sm_body" nounwind { + ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_res + ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0): + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop,
implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-COALESCER-BARRIER-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY [[COPY]].zsub + ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY1]] + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_]] + ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0 + ; + ; CHECK-REGALLOC-LABEL: name: dont_coalesce_res + ; CHECK-REGALLOC: bb.0 (%ir-block.0): + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-REGALLOC-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL renamable $q0, implicit killed $z0 + ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0 + ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0) + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) + ; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0 + %sa = call <vscale x 2 x i64> @scalable_res() + %res = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %sa, i64 0) + ret <2 x i64> %res +} + +define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { + ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_arg_that_is_also_res + ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0): + ; CHECK-COALESCER-BARRIER-NEXT: liveins: $q0 + ; CHECK-COALESCER-BARRIER-NEXT: {{ $}} + ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]] + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF + ; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: $z0 = COPY [[INSERT_SUBREG]] + ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_1:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COALESCER_BARRIER_FPR128_]] + ; CHECK-COALESCER-BARRIER-NEXT:
MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_1]] + ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0 + ; + ; CHECK-REGALLOC-LABEL: name: dont_coalesce_arg_that_is_also_res + ; CHECK-REGALLOC: bb.0 (%ir-block.0): + ; CHECK-REGALLOC-NEXT: liveins: $q0 + ; CHECK-REGALLOC-NEXT: {{ $}} + ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0 + ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0) + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) + ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0 + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) + ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0 + ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0) + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) + ; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0 + %sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0) + call void @scalable_args(<vscale x 2 x i64> %sa) + ret <2 x i64> %a +} + +declare void @scalable_args(<vscale x 2 x i64>) "aarch64_pstate_sm_enabled" +declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64) + +declare <vscale x 2 x i64> @scalable_res() "aarch64_pstate_sm_enabled" +declare <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll index d67573384ca9..6e262cc0786e 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll @@ -8,27 +8,31 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: sm_body_sm_compatible_simple: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]!
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT: bl __arm_sme_state
 ; CHECK-NEXT: and x8, x0, #0x1
 ; CHECK-NEXT: tbnz w8, #0, .LBB0_2
 ; CHECK-NEXT: // %bb.1:
 ; CHECK-NEXT: smstart sm
 ; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: fmov s0, wzr
+; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT: tbnz w8, #0, .LBB0_4
 ; CHECK-NEXT: // %bb.3:
 ; CHECK-NEXT: smstop sm
 ; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
 ; CHECK-NEXT: ret
 ret float zeroinitializer
 }
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index 91accad7f2fd..4d8980c84a05 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -94,17 +94,19 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
 ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: smstart sm
-; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: index z0.d, #0, #1
 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: add z0.d, z0.d, #41 // =0x29
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: smstop sm
 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: add sp, sp, #80
 ; CHECK-NEXT: ret
-- 
Gitee

From f0bf48d92237da1dbdc6eb7c1c732402fc03f249 Mon Sep 17 00:00:00 2001
From: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Date: Wed, 12 Feb 2025 10:03:23 +0800
Subject: [PATCH 75/77] [SME] Fix backport conflict for SME feature backport

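Resolve the conflicts by dropping the ZT0 paths in SMEABIPass and
SMEAttrs, deleting the PNRtoPPRCopy.mir test, and extending the Sema
check so that SVE sizeless builtin types are also accepted in functions
that carry the "sme" target feature, not only "sve". As a rough sketch
of the intended source-level effect (an illustration for this message,
not code from the patch; the attribute spelling is an assumption), a
function like the following should no longer hit
err_sve_vector_in_non_sve_target when only SME is enabled:

  #include <arm_sve.h>

  /* Hypothetical example: the function enables only "sme", not "sve".
     With this change, using the sizeless svint32_t in its body is
     accepted, because the caller feature map now satisfies the check
     via "sme" as well as "sve". */
  __attribute__((target("sme")))
  svint32_t pass_through(svint32_t x) {
    return x;
  }
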
Signed-off-by: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
---
 clang/lib/Sema/Sema.cpp                            |   4 +-
 llvm/lib/Target/AArch64/SMEABIPass.cpp             |   9 +-
 .../AArch64/Utils/AArch64SMEAttributes.h           |   2 +-
 llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir         |  44 ---
 llvm/test/CodeGen/AArch64/preserve.ll              |   4 +-
 ...compatible-to-normal-fn-wihout-sme-attr.ll      |   2 +-
 .../AArch64/sme-disable-gisel-fisel.ll             |  52 ++--
 .../CodeGen/AArch64/sme-intrinsics-loads.ll        |  18 +-
 .../CodeGen/AArch64/sme-intrinsics-stores.ll       |  10 +-
 .../CodeGen/AArch64/sme-lazy-save-call.ll          |  38 +--
 ...ate-sm-changing-call-disable-coalescing.ll      | 261 +++++++++---------
 .../AArch64/sme-shared-za-interface.ll             |  24 +-
 ...ing-body-streaming-compatible-interface.ll      |   4 +-
 .../CodeGen/AArch64/sme-streaming-body.ll          |  12 +-
 .../sme-streaming-compatible-interface.ll          |  12 +-
 .../AArch64/sme-streaming-interface.ll             |   6 +-
 ...nging-call-disable-stackslot-scavenging.ll      |   4 +-
 ...streaming-mode-fixed-length-masked-load.ll      |  12 +-
 .../Inline/AArch64/sme-pstatesm-attrs.ll           |   2 +
 llvm/test/Verifier/sme-attributes.ll               |  30 --
 20 files changed, 237 insertions(+), 313 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir

diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 46ae6fba8344..03030a02ee79 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -2057,8 +2057,8 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
     if (Ty->isSVESizelessBuiltinType() && FD && FD->hasBody()) {
       llvm::StringMap<bool> CallerFeatureMap;
       Context.getFunctionFeatureMap(CallerFeatureMap, FD);
-      if (!Builtin::evaluateRequiredTargetFeatures(
-              "sve", CallerFeatureMap))
+      if (!Builtin::evaluateRequiredTargetFeatures("sve", CallerFeatureMap) &&
+          !Builtin::evaluateRequiredTargetFeatures("sme", CallerFeatureMap))
         Diag(D->getLocation(), diag::err_sve_vector_in_non_sve_target) << Ty;
     }
   };
diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp
index 2b713fe28d83..8ed33602a96f 100644
--- a/llvm/lib/Target/AArch64/SMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp
@@ -139,13 +139,6 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
                        Builder.getInt32(0xff));
   }

-  if (FnAttrs.isNewZT0()) {
-    Function *ClearZT0Intr =
-        Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt);
-    Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr,
-                       {Builder.getInt32(0)});
-  }
-
   if (FnAttrs.hasPrivateZAInterface()) {
     // Before returning, disable pstate.za
     for (BasicBlock &BB : *F) {
@@ -173,7 +166,7 @@ bool SMEABI::runOnFunction(Function &F) {
   bool Changed = false;
   SMEAttrs FnAttrs(F);

-  if (FnAttrs.isNewZA() || FnAttrs.isNewZT0())
+  if (FnAttrs.isNewZA())
     Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs);

   return Changed;
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index 8214fda99fb1..b57e2176b020 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -94,7 +94,7 @@ public:
     return State == StateValue::In || State == StateValue::Out ||
            State == StateValue::InOut || State == StateValue::Preserved;
   }
-  bool hasSharedZAInterface() const { return sharesZA() || sharesZT0(); }
+  bool hasSharedZAInterface() const { return sharesZA(); }
   bool hasPrivateZAInterface() const { return !hasSharedZAInterface(); }
   bool hasZAState() const { return isNewZA() || sharesZA(); }
   bool requiresLazySave(const SMEAttrs &Callee) const {
diff --git a/llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir b/llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir
deleted file mode 100644
index 5b1e24ea732f..000000000000
---
a/llvm/test/CodeGen/AArch64/PNRtoPPRCopy.mir +++ /dev/null @@ -1,44 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -o - %s -mtriple=aarch64 -verify-machineinstrs -run-pass=postrapseudos -mattr=+sme2 | FileCheck %s ---- -name: pnr_to_ppr -alignment: 4 -tracksRegLiveness: true -tracksDebugUserValues: true -frameInfo: - maxAlignment: 1 - maxCallFrameSize: 0 -machineFunctionInfo: - hasRedZone: false -body: | - bb.0: - ; CHECK-LABEL: name: pnr_to_ppr - ; CHECK: renamable $pn8 = PTRUE_C_D - ; CHECK-NEXT: $p0 = ORR_PPzPP $p8, $p8, killed $p8, implicit-def $pn0 - ; CHECK-NEXT: RET_ReallyLR implicit killed $p0 - renamable $pn8 = PTRUE_C_D - $p0 = COPY killed renamable $pn8 - RET_ReallyLR implicit killed $p0 - -... ---- -name: ppr_to_pnr -alignment: 4 -tracksRegLiveness: true -tracksDebugUserValues: true -frameInfo: - maxAlignment: 1 - maxCallFrameSize: 0 -machineFunctionInfo: - hasRedZone: false -body: | - bb.0: - ; CHECK-LABEL: name: ppr_to_pnr - ; CHECK: renamable $p8 = PTRUE_H 31 - ; CHECK-NEXT: $p0 = ORR_PPzPP $p8, $p8, killed $p8, implicit-def $pn0 - ; CHECK-NEXT: RET_ReallyLR implicit killed $pn0 - renamable $p8 = PTRUE_H 31 - $pn0 = COPY killed renamable $p8 - RET_ReallyLR implicit killed $pn0 - -... diff --git a/llvm/test/CodeGen/AArch64/preserve.ll b/llvm/test/CodeGen/AArch64/preserve.ll index f95de60fbb24..7f57420a5fa1 100644 --- a/llvm/test/CodeGen/AArch64/preserve.ll +++ b/llvm/test/CodeGen/AArch64/preserve.ll @@ -4,13 +4,13 @@ target triple = "aarch64-unknown-unknown" declare void @bar1() define preserve_mostcc void @baz() #0 { -; CHECK: baz Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $b16 $b17 $b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 $q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 
$d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: baz Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 
$b16 $b17 $b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 $q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi $z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 
$q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void } define preserve_allcc void @foo() #0 { -; CHECK: foo Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 
$d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: foo Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 
$b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi $z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 
$w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll index dba3227459b9..ec3c972483b4 100644 --- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll +++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll @@ -52,8 +52,8 @@ define void @streaming_compatible_arg(float %f) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 86ee85ad50f9..94a711ba1c71 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -76,11 +76,11 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl normal_callee -; CHECK-COMMON-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart 
sm ; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 @@ -284,15 +284,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: msub x8, x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: sub x10, x29, #16 ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za @@ -346,20 +346,20 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: msub x8, x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: sub x10, x29, #16 ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbnz x8, .LBB10_2 ; CHECK-COMMON-NEXT: // %bb.1: ; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore @@ -386,14 +386,15 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw ; CHECK-COMMON-NEXT: smstop sm ; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl fmodf -; CHECK-COMMON-NEXT: str s0, [sp, #76] // 4-byte Folded Spill +; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm -; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr s0, [sp, #76] // 4-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 ; CHECK-COMMON-NEXT: ret %res = frem float %a, %b ret float %res @@ -411,8 
+412,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: bl __arm_sme_state -; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: and x19, x0, #0x1 +; CHECK-COMMON-NEXT: ldr s2, [sp, #8] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 ; CHECK-COMMON-NEXT: // %bb.1: @@ -426,11 +428,11 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: smstart sm ; CHECK-COMMON-NEXT: .LBB12_4: ; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-COMMON-NEXT: add sp, sp, #96 ; CHECK-COMMON-NEXT: ret %res = frem float %a, %b diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index a286ca4965e1..985b0581200f 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -352,14 +352,14 @@ entry: define void @ldr_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { ; CHECK-LABEL: ldr_with_off_many_imm_15_18: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: add w13, w0, #16 ; CHECK-NEXT: ldr za[w12, 15], [x1, #15, mul vl] -; CHECK-NEXT: add w12, w0, #16 -; CHECK-NEXT: ldr za[w12, 0], [x8] -; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] -; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w13, 0], [x8] +; CHECK-NEXT: ldr za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w13, 2], [x8, #2, mul vl] ; CHECK-NEXT: ret entry: tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 15) @@ -395,11 +395,11 @@ define void @ldr_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: add w12, w0, #16 ; CHECK-NEXT: add x9, x1, x8, lsl #4 ; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: add w13, w0, #32 ; CHECK-NEXT: ldr za[w12, 15], [x9, #15, mul vl] -; CHECK-NEXT: add w12, w0, #32 -; CHECK-NEXT: ldr za[w12, 0], [x8] -; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] -; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w13, 0], [x8] +; CHECK-NEXT: ldr za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w13, 2], [x8, #2, mul vl] ; CHECK-NEXT: ret entry: tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 31) diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll index 36d72b7a7abb..366e24df3e8c 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -359,11 +359,11 @@ define void @str_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: add w13, w0, #16 
; CHECK-NEXT: str za[w12, 15], [x1, #15, mul vl] -; CHECK-NEXT: add w12, w0, #16 -; CHECK-NEXT: str za[w12, 0], [x8] -; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] -; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w13, 0], [x8] +; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl] ; CHECK-NEXT: ret entry: tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 15) @@ -397,9 +397,9 @@ define void @str_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: add w12, w0, #16 -; CHECK-NEXT: add w13, w0, #32 ; CHECK-NEXT: add x9, x1, x8, lsl #4 ; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: add w13, w0, #32 ; CHECK-NEXT: str za[w12, 15], [x9, #15, mul vl] ; CHECK-NEXT: str za[w13, 0], [x8] ; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 2cd03fac0999..a3c3089d5425 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -11,15 +11,15 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -45,8 +45,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x19, #1 ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x19, #1 ; CHECK-NEXT: msub x8, x19, x19, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x20, x29, #16 @@ -91,15 +91,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za @@ -129,15 +129,15 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #80 ; CHECK-NEXT: stur wzr, [x29, #-68] ; CHECK-NEXT: sturh wzr, [x29, #-70] -; CHECK-NEXT: stur x9, [x29, #-80] -; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur x8, [x29, #-80] +; CHECK-NEXT: sturh w9, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index d5bea725b6d1..6035e2e62587 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -28,15 +28,15 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = insertelement poison, i8 %arg, i32 0 @@ -61,15 +61,15 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = insertelement poison, i16 %arg, i32 0 @@ -94,15 +94,15 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; 
CHECK-NEXT: bl use_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = insertelement poison, i32 %arg, i32 0 @@ -127,15 +127,15 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = insertelement poison, i64 %arg, i32 0 @@ -155,8 +155,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -165,17 +165,17 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = insertelement poison, half %arg, i32 0 @@ -195,8 +195,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -205,17 +205,17 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 ; 
CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = insertelement poison, float %arg, i32 0 @@ -235,8 +235,8 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -245,17 +245,17 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = insertelement poison, double %arg, i32 0 @@ -280,8 +280,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -290,17 +290,17 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %elt = extractelement <1 x i8> %arg, i32 0 @@ -321,8 +321,8 @@ 
define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -331,17 +331,17 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %elt = extractelement <1 x i16> %arg, i32 0 @@ -362,8 +362,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -372,17 +372,17 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %elt = extractelement <1 x i32> %arg, i32 0 @@ -403,8 +403,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -413,17 +413,17 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 
16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %elt = extractelement <1 x i64> %arg, i32 0 @@ -444,8 +444,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -454,17 +454,17 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %elt = extractelement <1 x half> %arg, i32 0 @@ -485,8 +485,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -495,17 +495,17 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %elt = extractelement <1 x float> %arg, i32 0 @@ -526,8 +526,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte 
Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -536,17 +536,17 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %elt = extractelement <1 x double> %arg, i32 0 @@ -571,8 +571,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -581,17 +581,17 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %arg, i64 0) @@ -611,8 +611,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -621,17 +621,17 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl 
sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %arg, i64 0) @@ -651,8 +651,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -661,17 +661,17 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %arg, i64 0) @@ -691,8 +691,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -701,17 +701,17 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %arg, i64 0) @@ -731,8 +731,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 
16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -741,17 +741,17 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %arg, i64 0) @@ -771,8 +771,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -781,17 +781,17 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> %arg, i64 0) @@ -811,8 +811,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -821,17 +821,17 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; 
CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> %arg, i64 0) @@ -851,8 +851,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -861,17 +861,17 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> %arg, i64 0) @@ -894,16 +894,16 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: and z1.b, z1.b, #0x1 ; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i1 @@ -914,10 +914,10 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %vec = 
call @llvm.vector.insert.nxv8i1.v8i1( poison, <8 x i1> %arg, i64 0) @@ -942,13 +942,13 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i8 @get_i8() @@ -969,13 +969,13 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i16 @get_i16() @@ -996,13 +996,13 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i32 @get_i32() @@ -1023,13 +1023,13 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i64 @get_i64() @@ -1052,15 +1052,15 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 { ; CHECK-NEXT: bl get_f16 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 
16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call half @get_f16() @@ -1083,15 +1083,15 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_f32 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call float @get_f32() @@ -1114,15 +1114,15 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_f64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call double @get_f64() @@ -1149,15 +1149,15 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i8 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i8> @get_v1i8() @@ -1181,15 +1181,15 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i16 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 
16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i16> @get_v1i16() @@ -1213,15 +1213,15 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i32 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i32> @get_v1i32() @@ -1245,15 +1245,15 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1i64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i64> @get_v1i64() @@ -1277,15 +1277,15 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1f16 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x half> @get_v1f16() @@ -1309,15 +1309,15 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1f32 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; 
CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x float> @get_v1f32() @@ -1341,15 +1341,15 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v1f64 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x double> @get_v1f64() @@ -1377,15 +1377,15 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v16i8 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <16 x i8> @get_v16i8() @@ -1408,15 +1408,15 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v8i16 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x i16> @get_v8i16() @@ -1439,15 +1439,15 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v4i32 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte 
Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x i32> @get_v4i32() @@ -1470,15 +1470,15 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v2i64 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x i64> @get_v2i64() @@ -1501,15 +1501,15 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v8f16 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x half> @get_v8f16() @@ -1532,15 +1532,15 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v4f32 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x float> @get_v4f32() @@ -1563,15 +1563,15 @@ define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { ; CHECK-NEXT: bl get_v2f64 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 
16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x double> @get_v2f64() @@ -1635,6 +1635,7 @@ declare @llvm.vector.insert.nxv4i32.v4i32(, declare @llvm.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) declare @llvm.vector.insert.nxv8f16.v8f16(, <8 x half>, i64) declare @llvm.vector.insert.nxv4f32.v4f32(, <4 x float>, i64) +declare @llvm.vector.insert.nxv8bf16.v8bf16(, <8 x bfloat>, i64) declare @llvm.vector.insert.nxv2f64.v2f64(, <2 x double>, i64) attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sve,+sme" } diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index c885fa4a76ec..49d7ae006bb1 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -10,15 +10,15 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -43,15 +43,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll index 6e262cc0786e..a50e44892c61 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll @@ -27,11 +27,11 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret ret float zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index 4d8980c84a05..384e7dc94f78 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -94,19 +94,17 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: add z0.d, z0.d, #41 // =0x29 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret @@ -327,9 +325,9 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind { ; CHECK-NEXT: bl streaming_compatible_callee ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; 
CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret tail call void @streaming_compatible_callee(); diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index fb3cdbb39865..a99818af3cfb 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -466,22 +466,24 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: .cfi_offset b15, -80 ; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: ldp s4, s0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr s4, [sp, #8] // 4-byte Folded Reload ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: stp s4, s0, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: ldp d4, d0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr d4, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: stp d4, d0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB10_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB10_2: // %entry -; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: mov x0, x9 ; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov x0, x9 ; CHECK-NEXT: mov x1, x8 +; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl bar ; CHECK-NEXT: tbz w19, #0, .LBB10_4 ; CHECK-NEXT: // %bb.3: // %entry diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 59f1de77d4e0..9270a6392c41 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -396,7 +396,7 @@ entry: ret i8 %vecext } -define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 { +define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #1 { ; CHECK-LABEL: call_to_non_streaming_pass_args: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #112 @@ -408,15 +408,15 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: stp s1, s0, [sp, #24] // 8-byte Folded Spill ; CHECK-NEXT: stp d3, d2, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp s1, s0, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: ldp d3, d2, [sp, #8] // 16-byte Folded Reload +; CHECK-NEXT: ldp s1, s0, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: bl bar ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret entry: diff --git 
a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index f56484e82157..b0cddf6dbb60 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -20,8 +20,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill @@ -32,10 +32,10 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret %ptr = alloca diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index a8301caf8695..050c7353b97d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -340,12 +340,12 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: strh w3, [sp, #12] ; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w3, [sp, #12] ; CHECK-NEXT: strh w2, [sp, #10] -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: strh w1, [sp, #8] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: ldr d1, [sp, #8] ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 @@ -366,12 +366,12 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: strh w3, [sp, #12] ; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w3, [sp, #12] ; CHECK-NEXT: strh w2, [sp, #10] -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: strh w1, [sp, #8] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: ldr d1, [sp, #8] ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll index 3aca46fea04c..580fabfd00cb 100644 --- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll +++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll @@ -55,6 +55,8 @@ entry: ret i32 %res } +declare i32 @llvm.vscale() + define i32 @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { ; CHECK-LABEL: define i32 
@streaming_compatible_locally_streaming_callee
; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
diff --git a/llvm/test/Verifier/sme-attributes.ll b/llvm/test/Verifier/sme-attributes.ll
index 3d01613ebf2f..c992cd7adcd8 100644
--- a/llvm/test/Verifier/sme-attributes.ll
+++ b/llvm/test/Verifier/sme-attributes.ll
@@ -32,33 +32,3 @@ declare void @za_in_out() "aarch64_in_za" "aarch64_out_za";
 declare void @za_inout_out() "aarch64_inout_za" "aarch64_out_za";
 ; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
-
-declare void @zt0_new_preserved() "aarch64_new_zt0" "aarch64_preserves_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_new_in() "aarch64_new_zt0" "aarch64_in_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_new_inout() "aarch64_new_zt0" "aarch64_inout_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_new_out() "aarch64_new_zt0" "aarch64_out_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_preserved_in() "aarch64_preserves_zt0" "aarch64_in_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_preserved_inout() "aarch64_preserves_zt0" "aarch64_inout_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_preserved_out() "aarch64_preserves_zt0" "aarch64_out_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_in_inout() "aarch64_in_zt0" "aarch64_inout_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_in_out() "aarch64_in_zt0" "aarch64_out_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
-
-declare void @zt0_inout_out() "aarch64_inout_zt0" "aarch64_out_zt0";
-; CHECK: Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', 'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive
--
Gitee

From b97d74bdb2e61b6a4727271728353c143baf9581 Mon Sep 17 00:00:00 2001
From: chenmiao <15273704+chenmiao32@user.noreply.gitee.com>
Date: Sat, 22 Feb 2025 14:57:13 +0800
Subject: [PATCH 76/77] [SME] build compiler-rt in RUNTIME instead of PROJECT

If compiler-rt is built in the PROJECT part, it is compiled by the host
gcc/g++, which leads to test failures for the SME ABI and builtins. After
this change it is built in the RUNTIME part, so the just-built clang
compiles it instead.
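For reference, the runtimes configuration this enables is roughly equivalent
to the following standalone invocation. This is a minimal sketch, assuming a
typical llvm-project checkout and a previously installed clang under
$install_prefix; the paths and the exact option set are illustrative and not
part of this patch:

  # Build compiler-rt as a runtime so it is compiled by the just-built
  # clang/clang++ rather than the host gcc/g++.
  cmake -G Ninja ../runtimes \
      -DCMAKE_BUILD_TYPE=RelWithDebInfo \
      -DCMAKE_C_COMPILER="$install_prefix/bin/clang" \
      -DCMAKE_CXX_COMPILER="$install_prefix/bin/clang++" \
      -DLLVM_ENABLE_RUNTIMES="compiler-rt" \
      -DCOMPILER_RT_BUILD_SANITIZERS=ON
  ninja && ninja install

With compiler-rt handled that way, the main build only needs compiler-rt
removed from $enabled_projects, which is what the build.sh hunks below do.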
Signed-off-by: chenmiao
---
 build.sh                                | 41 ++++++++++++++++++++++---
 compiler-rt/lib/builtins/CMakeLists.txt |  2 +-
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/build.sh b/build.sh
index 6569b933b006..5c47c21e440e 100755
--- a/build.sh
+++ b/build.sh
@@ -10,7 +10,7 @@ enable_autotuner="1"
 buildtype=RelWithDebInfo
 backends="all"
 build_for_openeuler="0"
-enabled_projects="clang;lld;compiler-rt;openmp;clang-tools-extra"
+enabled_projects="clang;lld;openmp;clang-tools-extra"
 embedded_toolchain="0"
 split_dwarf=on
 use_ccache="0"
@@ -322,7 +322,7 @@ fi

 if [ $embedded_toolchain == "1" ]; then
   echo "Build for embedded cross tool chain"
-  enabled_projects="clang;lld;compiler-rt;"
+  enabled_projects="clang;lld;"
   CMAKE_OPTIONS="$CMAKE_OPTIONS \
     -DLLVM_BUILD_FOR_EMBEDDED=ON"
 fi
@@ -364,14 +364,14 @@ mkdir -p "$build_prefix" && cd "$build_prefix"
 cmake $CMAKE_OPTIONS \
   -DCOMPILER_RT_BUILD_SANITIZERS=on \
   -DLLVM_ENABLE_PROJECTS=$enabled_projects \
-  -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
+  -DLLVM_ENABLE_RUNTIMES="compiler-rt;libunwind" \
   -DLLVM_USE_LINKER=gold \
   -DLLVM_LIT_ARGS="-sv -j$threads" \
   -DLLVM_USE_SPLIT_DWARF=$split_dwarf \
   -DCMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO="-Wl,--gdb-index -Wl,--compress-debug-sections=zlib" \
   -DCMAKE_EXE_LINKER_FLAGS_DEBUG="-Wl,--gdb-index -Wl,--compress-debug-sections=zlib" \
   -DBUILD_SHARED_LIBS=OFF \
-  -DLLVM_ENABLE_LIBCXX=OFF \
+  -DLLVM_STATIC_LINK_CXX_STDLIB=ON \
   -DLLVM_ENABLE_ZLIB=ON \
   -DLLVM_BUILD_RUNTIME=ON \
   -DLLVM_INCLUDE_TOOLS=ON \
@@ -409,6 +409,39 @@ if [ $do_install == "1" ]; then
   make -j$threads $verbose $install
 fi

+# build libcxx/libcxxabi with the just-built clang/clang++
+c_compiler="$install_prefix/bin/clang"
+cxx_compiler="$install_prefix/bin/clang++"
+if pushd runtimes > /dev/null 2>&1; then
+  if [ ! -f "$build_prefix"/projects/libcxx/CMakeCache.txt ]; then
+    mkdir -p "$build_prefix/projects/libcxx" && cd "$build_prefix/projects/libcxx"
+    cmake -Wno-dev \
+      -DCMAKE_BUILD_TYPE=$buildtype \
+      -DCMAKE_INSTALL_PREFIX="$install_prefix" \
+      -DCMAKE_C_COMPILER="$c_compiler" \
+      -DCMAKE_CXX_COMPILER="$cxx_compiler" \
+      -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \
+      -DLLVM_LIT_ARGS="-sv -j$threads" \
+      -DBUILD_SHARED_LIBS=OFF \
+      -DCMAKE_SKIP_RPATH=ON \
+      -DLLVM_USE_LINKER=gold \
+      -DLLVM_USE_SPLIT_DWARF=$split_dwarf \
+      ../../../runtimes
+  else
+    cd "$build_prefix"/projects/libcxx
+  fi
+  install_libcxx=${install/\/strip/-stripped}
+  make -j$threads $verbose \
+    ${install_libcxx/install/install-cxx} ${install_libcxx/install/install-cxxabi} ${install_libcxx/install/install-cxxabi-headers}
+  if [ -n "$unit_test" ]; then
+    make -j$threads $verbose ${unit_test/all/cxx} ${unit_test/all/cxxabi}
+  fi
+  popd > /dev/null 2>&1
+else
+  echo "$0: directory not found: libcxx"
+  exit 1
+fi
+
 if [ -n "$unit_test" ]; then
   make -j$threads $verbose check-all
 fi
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index d08bdd1959b8..cf376b4a021c 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -555,7 +555,7 @@ set(aarch64_SOURCES
   aarch64/fp_mode.c
 )

-if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
+if(COMPILER_RT_HAS_ASM_SME)
   list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c)
   message(STATUS "AArch64 SME ABI routines enabled")
 else()
--
Gitee

From 8ed36c38ef4102b8965e276c5da8c49da11fe44f Mon Sep 17 00:00:00 2001
From: chenmiao
Date: Tue, 4 Mar 2025 19:28:34 +0800
Subject: [PATCH 77/77] [SME] resolve conflicts from git pull

Merge branch 'dev_17.0.6' of https://gitee.com/chenmiao32/llvm-project into
dev_17.0.6
---
 ...ate-sm-changing-call-disable-coalescing.ll | 38 +++++++++----------
 .../sme-streaming-compatible-interface.ll     |  2 +-
 ...nging-call-disable-stackslot-scavenging.ll |  2 +-
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index 6035e2e62587..b702c71001a6 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -153,8 +153,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
 ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #16
 ; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
 ; CHECK-NEXT: add x8, sp, #16
 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT: mov x19, x0
@@ -193,8 +193,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
 ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT: str x29, [sp,
#64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -278,8 +278,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -319,8 +319,8 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -360,8 +360,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -401,8 +401,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -442,8 +442,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -483,8 +483,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -524,8 +524,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov x19, 
x0 @@ -569,8 +569,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -609,8 +609,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -649,8 +649,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -689,8 +689,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -729,8 +729,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -769,8 +769,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -809,8 +809,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -849,8 +849,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; 
CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov x19, x0 @@ -892,8 +892,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: add x8, sp, #16 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index a99818af3cfb..d1c542bb1c5c 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,8 +129,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index b0cddf6dbb60..0e6de90289e9 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -20,8 +20,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill -- Gitee