[Arm64EC] Add support for `half` by tgross35 · Pull Request #152843 · llvm/llvm-project
@llvm/pr-subscribers-backend-aarch64
Author: Trevor Gross (tgross35)
Changes
f16 is passed and returned in vector registers on both x86 and AArch64, using the same calling convention as f32, so it is a straightforward type to support. The calling convention support already exists, added as part of a6065f0 ("Arm64EC entry/exit thunks, consolidated. (#79067)"). Thus, add mangling and remove the error in order to make half work.
MSVC does not yet support _Float16, so for now this will remain an LLVM-only extension.
Full diff: https://github.com/llvm/llvm-project/pull/152843.diff
6 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp (+18-6)
- (modified) llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll (+5-5)
- (modified) llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll (+39-36)
- (modified) llvm/test/CodeGen/AArch64/frexp-arm64ec.ll (+9)
- (modified) llvm/test/CodeGen/AArch64/ldexp-arm64ec.ll (+9)
- (modified) llvm/test/CodeGen/AArch64/powi-arm64ec.ll (+11-1)
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index ad8368e1692be..aa52b71b862ab 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -316,6 +316,11 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( ThunkArgTranslation::PointerIndirection}; }; + if (T->isHalfTy()) { + Out << "h"; + return direct(T); + } + if (T->isFloatTy()) { Out << "f"; return direct(T); @@ -327,8 +332,8 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( } if (T->isFloatingPointTy()) { - report_fatal_error( - "Only 32 and 64 bit floating points are supported for ARM64EC thunks"); + report_fatal_error("Only 16, 32, and 64 bit floating points are supported " + "for ARM64EC thunks"); } auto &DL = M->getDataLayout(); @@ -342,8 +347,15 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( uint64_t ElementCnt = T->getArrayNumElements(); uint64_t ElementSizePerBytes = DL.getTypeSizeInBits(ElementTy) / 8; uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes; - if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) { - Out << (ElementTy->isFloatTy() ? 
"F" : "D") << TotalSizeBytes; + if (ElementTy->isHalfTy() || ElementTy->isFloatTy() || + ElementTy->isDoubleTy()) { + if (ElementTy->isHalfTy()) + Out << "H"; + else if (ElementTy->isFloatTy()) + Out << "F"; + else if (ElementTy->isDoubleTy()) + Out << "D"; + Out << TotalSizeBytes; if (Alignment.value() >= 16 && !Ret) Out << "a" << Alignment.value(); if (TotalSizeBytes <= 8) { @@ -355,8 +367,8 @@ ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( return pointerIndirection(T); } } else if (T->isFloatingPointTy()) { - report_fatal_error("Only 32 and 64 bit floating points are supported for " - "ARM64EC thunks"); + report_fatal_error("Only 16, 32, and 64 bit floating points are supported " + "for ARM64EC thunks"); } } diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll index 6aeeeed94543d..8b70c2738e3f4 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll @@ -85,10 +85,10 @@ define i64 @simple_integers(i8, i16, i32, i64) nounwind { ret i64 0 } -; NOTE: Only float and double are supported. -define double @simple_floats(float, double) nounwind { -; CHECK-LABEL: .def $ientry_thunk$cdecl$d$fd; -; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$d$fd +; NOTE: Only half, float, and double are supported. +define double @simple_floats(half, float, double) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$d$hfd; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$d$hfd ; CHECK: // %bb.0: ; CHECK-NEXT: stp q6, q7, [sp, #-176]! 
// 32-byte Folded Spill ; CHECK-NEXT: .seh_save_any_reg_px q6, 176 @@ -600,7 +600,7 @@ start: ; CHECK-NEXT: .symidx $ientry_thunk$cdecl$i8$i8i8i8i8 ; CHECK-NEXT: .word 1 ; CHECK-NEXT: .symidx "#simple_floats" -; CHECK-NEXT: .symidx $ientry_thunk$cdecl$d$fd +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$d$hfd ; CHECK-NEXT: .word 1 ; CHECK-NEXT: .symidx "#has_varargs" ; CHECK-NEXT: .symidx $ientry_thunk$cdecl$v$varargs diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll index cba7a8100930f..22bffc7a0a33d 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll @@ -93,10 +93,10 @@ declare i64 @simple_integers(i8, i16, i32, i64) nounwind; ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc -; NOTE: Only float and double are supported. -declare double @simple_floats(float, double) nounwind; -; CHECK-LABEL: .def $iexit_thunk$cdecl$d$fd; -; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$d$fd +; NOTE: Only half, float, and double are supported. 
+declare double @simple_floats(half, float, double) nounwind; +; CHECK-LABEL: .def $iexit_thunk$cdecl$d$hfd; +; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$d$hfd ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 ; CHECK-NEXT: .seh_stackalloc 48 @@ -129,8 +129,8 @@ declare double @simple_floats(float, double) nounwind; ; CHECK-NEXT: adrp x11, simple_floats ; CHECK-NEXT: add x11, x11, :lo12:simple_floats ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$d$fd -; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$d$fd +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$d$hfd +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$d$hfd ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -282,33 +282,36 @@ declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind; ; CHECK: .seh_endfunclet ; CHECK: .seh_endproc -declare [2 x i8] @small_array([2 x i8], [2 x float]) nounwind; -; CHECK-LABEL: .def $iexit_thunk$cdecl$m2$m2F8; -; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m2$m2F8 +declare [2 x i8] @small_array([2 x i8], [2 x half], [2 x float]) nounwind; +; CHECK-LABEL: .def $iexit_thunk$cdecl$m2$m2mF8; +; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m2$m2mF8 ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .seh_stackalloc 64 -; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .seh_save_fplr 48 -; CHECK-NEXT: add x29, sp, #48 -; CHECK-NEXT: .seh_add_fp 48 +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .seh_stackalloc 80 +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 64 +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .seh_add_fp 64 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: sturb w1, [x29, #-1] -; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect -; CHECK-NEXT: sturb w0, [x29, #-2] -; CHECK-NEXT: ldr x16, [x8, 
:lo12:__os_arm64x_dispatch_call_no_redirect] -; CHECK-NEXT: stp s0, s1, [x29, #-12] -; CHECK-NEXT: ldurh w0, [x29, #-2] -; CHECK-NEXT: ldur x1, [x29, #-12] -; CHECK-NEXT: blr x16 -; CHECK-NEXT: mov w0, w8 -; CHECK-NEXT: sturh w8, [x29, #-14] -; CHECK-NEXT: ubfx w1, w8, #8, #8 +; CHECK-NEXT: sturb w0, [x29, #-2] +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect +; CHECK-NEXT: sturb w1, [x29, #-1] +; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] +; CHECK-NEXT: stur h0, [x29, #-6] +; CHECK-NEXT: ldurh w0, [x29, #-2] +; CHECK-NEXT: stur h1, [x29, #-4] +; CHECK-NEXT: stp s2, s3, [x29, #-16] +; CHECK-NEXT: ldur w1, [x29, #-6] +; CHECK-NEXT: ldur x2, [x29, #-16] +; CHECK-NEXT: blr x16 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: sturh w8, [x29, #-18] +; CHECK-NEXT: ubfx w1, w8, #8, #8 ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: .seh_save_fplr 48 -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: .seh_stackalloc 64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 64 +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: .seh_stackalloc 80 ; CHECK-NEXT: .seh_endepilogue ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet @@ -325,8 +328,8 @@ declare [2 x i8] @small_array([2 x i8], [2 x float]) nounwind; ; CHECK-NEXT: adrp x11, small_array ; CHECK-NEXT: add x11, x11, :lo12:small_array ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m2$m2F8 -; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m2$m2F8 +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m2$m2mF8 +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m2$m2mF8 ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -577,7 +580,7 @@ declare <8 x i16> @large_vector(<8 x i16> %0) nounwind; ; CHECK-NEXT: .symidx simple_integers ; CHECK-NEXT: .word 0 ; CHECK-NEXT: .symidx simple_floats -; 
CHECK-NEXT: .symidx $iexit_thunk$cdecl$d$fd +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$d$hfd ; CHECK-NEXT: .word 4 ; CHECK-NEXT: .symidx "#simple_floats$exit_thunk" ; CHECK-NEXT: .symidx simple_floats @@ -601,7 +604,7 @@ declare <8 x i16> @large_vector(<8 x i16> %0) nounwind; ; CHECK-NEXT: .symidx has_aligned_sret ; CHECK-NEXT: .word 0 ; CHECK-NEXT: .symidx small_array -; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m2$m2F8 +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m2$m2mF8 ; CHECK-NEXT: .word 4 ; CHECK-NEXT: .symidx "#small_array$exit_thunk" ; CHECK-NEXT: .symidx small_array @@ -634,14 +637,14 @@ declare <8 x i16> @large_vector(<8 x i16> %0) nounwind; define void @func_caller() nounwind { call void @no_op() call i64 @simple_integers(i8 0, i16 0, i32 0, i64 0) - call double @simple_floats(float 0.0, double 0.0) + call double @simple_floats(half 0.0, float 0.0, double 0.0) call void (...) @has_varargs() %c = alloca i8 call void @has_sret(ptr sret([100 x i8]) %c) %aligned = alloca %TSRet, align 32 store %TSRet { i64 0, i64 0 }, ptr %aligned, align 32 call void @has_aligned_sret(ptr align 32 sret(%TSRet) %aligned) - call [2 x i8] @small_array([2 x i8] [i8 0, i8 0], [2 x float] [float 0.0, float 0.0]) + call [2 x i8] @small_array([2 x i8] [i8 0, i8 0], [2 x half] [half 0.0, half 0.0], [2 x float] [float 0.0, float 0.0]) call [3 x i64] @large_array([3 x i64] [i64 0, i64 0, i64 0], [2 x double] [double 0.0, double 0.0], [2 x [2 x i64]] [[2 x i64] [i64 0, i64 0], [2 x i64] [i64 0, i64 0]]) call %T2 @simple_struct(%T1 { i16 0 }, %T2 { i32 0, float 0.0 }, %T3 { i64 0, double 0.0 }, %T4 { i64 0, double 0.0, i8 0 }) call <4 x i8> @small_vector(<4 x i8> <i8 0, i8 0, i8 0, i8 0>) diff --git a/llvm/test/CodeGen/AArch64/frexp-arm64ec.ll b/llvm/test/CodeGen/AArch64/frexp-arm64ec.ll index ee326caa77c0a..c27d3c9588b9d 100644 --- a/llvm/test/CodeGen/AArch64/frexp-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/frexp-arm64ec.ll @@ -2,6 +2,15 @@ ; Separate from llvm-frexp.ll test because this 
errors on half cases +; ARM64EC-LABEL: test_frexp_f16_i32 +; ARM64EC: fcvt d0, h0 +; ARM64EC: bl "#frexp" +; ARM64EC: fcvt h0, d0 +define { half, i32 } @test_frexp_f16_i32(half %a) { + %result = call { half, i32 } @llvm.frexp.f16.i32(half %a) + ret { half, i32 } %result +} + ; ARM64EC-LABEL: test_frexp_f32_i32 ; ARM64EC: fcvt d0, s0 ; ARM64EC: bl "#frexp" diff --git a/llvm/test/CodeGen/AArch64/ldexp-arm64ec.ll b/llvm/test/CodeGen/AArch64/ldexp-arm64ec.ll index 1f8eeccf9c338..0fde7b95f5462 100644 --- a/llvm/test/CodeGen/AArch64/ldexp-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/ldexp-arm64ec.ll @@ -3,6 +3,15 @@ ; Separate from ldexp.ll test because this errors on half cases +; ARM64EC-LABEL: ldexp_f16 = +; ARM64EC: fcvt d0, h0 +; ARM64EC: bl "#ldexp" +; ARM64EC: fcvt h0, d0 +define half @ldexp_f16(half %val, i32 %a) { + %call = call half @llvm.ldexp.f16(half %val, i32 %a) + ret half %call +} + ; ARM64EC-LABEL: ldexp_f32 = ; ARM64EC: fcvt d0, s0 ; ARM64EC: bl "#ldexp" diff --git a/llvm/test/CodeGen/AArch64/powi-arm64ec.ll b/llvm/test/CodeGen/AArch64/powi-arm64ec.ll index 707159eb432ec..2e38f3c5e9a54 100644 --- a/llvm/test/CodeGen/AArch64/powi-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/powi-arm64ec.ll @@ -1,8 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=arm64ec-windows-msvc < %s | FileCheck -check-prefix=ARM64EC %s -declare double @llvm.powi.f64.i32(double, i32) +declare half @llvm.powi.f16.i32(half, i32) declare float @llvm.powi.f32.i32(float, i32) +declare double @llvm.powi.f64.i32(double, i32) + +; ARM64EC-LABEL: powi_f16 +; ARM64EC: fcvt s0, h0 +; ARM64EC: scvtf s1, w0 +; ARM64EC: bl "#powf" +define half @powi_f16(half %x, i32 %n) nounwind { + %ret = tail call half @llvm.powi.f16.i32(half %x, i32 %n) + ret half %ret +} ; ARM64EC-LABEL: powi_f32 ; ARM64EC: scvtf s1, w0