[PATCH v2] Support Intel AVX10.2 BF16 instructions
Jan Beulich
jbeulich@suse.com
Fri Oct 18 13:07:14 GMT 2024
More information about the Binutils mailing list
Fri Oct 18 13:07:14 GMT 2024
- Previous message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Next message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
On 17.10.2024 09:28, Haochen Jiang wrote:
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/avx10_2-256-bf16.s
> @@ -0,0 +1,169 @@
> +# Check 32bit AVX10.2/256 instructions
> +
> + .arch generic32
> + .arch .avx10.2/256
> + .text
> +_start:
> + .irp m, addne, divne, max, min, mulne, scalef, subne
> + v\m\()pbf16 %ymm4, %ymm5, %ymm6
> + v\m\()pbf16 %xmm4, %xmm5, %xmm6
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + v\m\()pbf16 (%ecx){1to16}, %ymm5, %ymm6
> + v\m\()pbf16 4064(%ecx), %ymm5, %ymm6
> + v\m\()pbf16 -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + v\m\()pbf16 (%ecx){1to8}, %xmm5, %xmm6
> + v\m\()pbf16 2032(%ecx), %xmm5, %xmm6
> + v\m\()pbf16 -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> + .endr
> +
> + .irp m, madd, msub, nmadd, nmsub
> + .irp n, 132, 213, 231
> + vf\m\n\(\())nepbf16 %ymm4, %ymm5, %ymm6
> + vf\m\n\(\())nepbf16 %xmm4, %xmm5, %xmm6
> + vf\m\n\(\())nepbf16 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + vf\m\n\(\())nepbf16 (%ecx){1to16}, %ymm5, %ymm6
> + vf\m\n\(\())nepbf16 4064(%ecx), %ymm5, %ymm6
> + vf\m\n\(\())nepbf16 -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> + vf\m\n\(\())nepbf16 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vf\m\n\(\())nepbf16 (%ecx){1to8}, %xmm5, %xmm6
> + vf\m\n\(\())nepbf16 2032(%ecx), %xmm5, %xmm6
> + vf\m\n\(\())nepbf16 -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> + .endr
> + .endr
> +
> + .irp m, getexp, rcp, rsqrt, sqrtne
> + v\m\()pbf16 %xmm5, %xmm6
> + v\m\()pbf16 %ymm5, %ymm6
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %xmm6{%k7}
> + v\m\()pbf16 (%ecx){1to8}, %xmm6
> + v\m\()pbf16 2032(%ecx), %xmm6
> + v\m\()pbf16 -256(%edx){1to8}, %xmm6{%k7}{z}
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %ymm6{%k7}
> + v\m\()pbf16 (%ecx){1to16}, %ymm6
> + v\m\()pbf16 4064(%ecx), %ymm6
> + v\m\()pbf16 -256(%edx){1to16}, %ymm6{%k7}{z}
> + .endr
> +
> + .irp m, getmant, reducene, rndscalene
> + v\m\()pbf16 $123, %ymm5, %ymm6
> + v\m\()pbf16 $123, %xmm5, %xmm6
> + v\m\()pbf16 $123, 0x10000000(%esp, %esi, 8), %xmm6{%k7}
> + v\m\()pbf16 $123, (%ecx){1to8}, %xmm6
> + v\m\()pbf16 $123, 2032(%ecx), %xmm6
> + v\m\()pbf16 $123, -256(%edx){1to8}, %xmm6{%k7}{z}
> + v\m\()pbf16 $123, 0x10000000(%esp, %esi, 8), %ymm6{%k7}
> + v\m\()pbf16 $123, (%ecx){1to16}, %ymm6
> + v\m\()pbf16 $123, 4064(%ecx), %ymm6
> + v\m\()pbf16 $123, -256(%edx){1to16}, %ymm6{%k7}{z}
> + .endr
> +
> + vcmppbf16 $123, %ymm4, %ymm5, %k5
> + vcmppbf16 $123, %xmm4, %xmm5, %k5
> + vcmppbf16 $123, 0x10000000(%esp, %esi, 8), %xmm5, %k5{%k7}
> + vcmppbf16 $123, (%ecx){1to8}, %xmm5, %k5
> + vcmppbf16 $123, 2032(%ecx), %xmm5, %k5
> + vcmppbf16 $123, -256(%edx){1to8}, %xmm5, %k5{%k7}
> + vcmppbf16 $123, 0x10000000(%esp, %esi, 8), %ymm5, %k5{%k7}
> + vcmppbf16 $123, (%ecx){1to16}, %ymm5, %k5
> + vcmppbf16 $123, 4064(%ecx), %ymm5, %k5
> + vcmppbf16 $123, -256(%edx){1to16}, %ymm5, %k5{%k7}
> +
> + vcomsbf16 %xmm5, %xmm6
> + vcomsbf16 0x10000000(%esp, %esi, 8), %xmm6
> + vcomsbf16 (%ecx), %xmm6
> + vcomsbf16 254(%ecx), %xmm6
> + vcomsbf16 -256(%edx), %xmm6
> +
> + vfpclasspbf16 $123, %ymm5, %k5
> + vfpclasspbf16 $123, %xmm5, %k5
> + vfpclasspbf16x $123, 0x10000000(%esp, %esi, 8), %k5{%k7}
> + vfpclasspbf16 $123, (%ecx){1to8}, %k5
> + vfpclasspbf16x $123, 2032(%ecx), %k5
> + vfpclasspbf16 $123, -256(%edx){1to8}, %k5{%k7}
> + vfpclasspbf16 $123, (%ecx){1to16}, %k5
> + vfpclasspbf16y $123, 4064(%ecx), %k5
> + vfpclasspbf16 $123, -256(%edx){1to16}, %k5{%k7}
> +
> +_intel:
> + .intel_syntax noprefix
> + .irp m, addne, divne, max, min, mulne, scalef, subne
> + v\m\()pbf16 ymm6, ymm5, ymm4
> + v\m\()pbf16 xmm6, xmm5, xmm4
> + v\m\()pbf16 ymm6{k7}, ymm5, YMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 ymm6, ymm5, WORD PTR [ecx]{1to16}
> + v\m\()pbf16 ymm6, ymm5, YMMWORD PTR [ecx+4064]
> + v\m\()pbf16 ymm6{k7}{z}, ymm5, WORD PTR [edx-256]{1to16}
> + v\m\()pbf16 xmm6{k7}, xmm5, XMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 xmm6, xmm5, WORD PTR [ecx]{1to8}
> + v\m\()pbf16 xmm6, xmm5, XMMWORD PTR [ecx+2032]
> + v\m\()pbf16 xmm6{k7}{z}, xmm5, WORD PTR [edx-256]{1to8}
> + .endr
> +
> + .irp m, madd, msub, nmadd, nmsub
> + .irp n, 132, 213, 231
> + vf\m\n\(\())nepbf16 ymm6, ymm5, ymm4
> + vf\m\n\(\())nepbf16 xmm6, xmm5, xmm4
> + vf\m\n\(\())nepbf16 ymm6{k7}, ymm5, YMMWORD PTR [esp+esi*8+0x10000000]
> + vf\m\n\(\())nepbf16 ymm6, ymm5, WORD PTR [ecx]{1to16}
> + vf\m\n\(\())nepbf16 ymm6, ymm5, YMMWORD PTR [ecx+4064]
> + vf\m\n\(\())nepbf16 ymm6{k7}{z}, ymm5, WORD PTR [edx-256]{1to16}
> + vf\m\n\(\())nepbf16 xmm6{k7}, xmm5, XMMWORD PTR [esp+esi*8+0x10000000]
> + vf\m\n\(\())nepbf16 xmm6, xmm5, WORD PTR [ecx]{1to8}
> + vf\m\n\(\())nepbf16 xmm6, xmm5, XMMWORD PTR [ecx+2032]
> + vf\m\n\(\())nepbf16 xmm6{k7}{z}, xmm5, WORD PTR [edx-256]{1to8}
> + .endr
> + .endr
> +
> + .irp m, getexp, rcp, rsqrt, sqrtne
> + v\m\()pbf16 xmm6, xmm5
> + v\m\()pbf16 ymm6, ymm5
> + v\m\()pbf16 xmm6{k7}, XMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 xmm6, WORD PTR [ecx]{1to8}
> + v\m\()pbf16 xmm6, XMMWORD PTR [ecx+2032]
> + v\m\()pbf16 xmm6{k7}{z}, WORD PTR [edx-256]{1to8}
> + v\m\()pbf16 ymm6{k7}, YMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 ymm6, WORD PTR [ecx]{1to16}
> + v\m\()pbf16 ymm6, YMMWORD PTR [ecx+4064]
> + v\m\()pbf16 ymm6{k7}{z}, WORD PTR [edx-256]{1to16}
> + .endr
> +
> + .irp m, getmant, reducene, rndscalene
> + v\m\()pbf16 ymm6, ymm5, 123
> + v\m\()pbf16 xmm6, xmm5, 123
> + v\m\()pbf16 xmm6{k7}, XMMWORD PTR [esp+esi*8+0x10000000], 123
> + v\m\()pbf16 xmm6, WORD PTR [ecx]{1to8}, 123
> + v\m\()pbf16 xmm6, XMMWORD PTR [ecx+2032], 123
> + v\m\()pbf16 xmm6{k7}{z}, WORD PTR [edx-256]{1to8}, 123
> + v\m\()pbf16 ymm6{k7}, YMMWORD PTR [esp+esi*8+0x10000000], 123
> + v\m\()pbf16 ymm6, WORD PTR [ecx]{1to16}, 123
> + v\m\()pbf16 ymm6, YMMWORD PTR [ecx+4064], 123
> + v\m\()pbf16 ymm6{k7}{z}, WORD PTR [edx-256]{1to16}, 123
> + .endr
> +
> + vcmppbf16 k5, ymm5, ymm4, 123
> + vcmppbf16 k5, xmm5, xmm4, 123
> + vcmppbf16 k5{k7}, xmm5, XMMWORD PTR [esp+esi*8+0x10000000], 123
> + vcmppbf16 k5, xmm5, WORD PTR [ecx]{1to8}, 123
> + vcmppbf16 k5, xmm5, XMMWORD PTR [ecx+2032], 123
> + vcmppbf16 k5{k7}, xmm5, WORD PTR [edx-256]{1to8}, 123
> + vcmppbf16 k5{k7}, ymm5, YMMWORD PTR [esp+esi*8+0x10000000], 123
> + vcmppbf16 k5, ymm5, WORD PTR [ecx]{1to16}, 123
> + vcmppbf16 k5, ymm5, YMMWORD PTR [ecx+4064], 123
> + vcmppbf16 k5{k7}, ymm5, WORD PTR [edx-256]{1to16}, 123
> +
> + vcomsbf16 xmm6, xmm5
> + vcomsbf16 xmm6, WORD PTR [esp+esi*8+0x10000000]
> + vcomsbf16 xmm6, WORD PTR [ecx]
> + vcomsbf16 xmm6, WORD PTR [ecx+254]
> + vcomsbf16 xmm6, WORD PTR [edx-256]
> +
> + vfpclasspbf16 k5, ymm5, 123
> + vfpclasspbf16 k5, xmm5, 123
> + vfpclasspbf16 k5{k7}, XMMWORD PTR [esp+esi*8+0x10000000], 123
> + vfpclasspbf16 k5, WORD PTR [ecx]{1to8}, 123
Here and in similar places: Can you please make sure you at least test a
mix of broadcast forms with and without "WORD PTR"? The broadcast
specifier alone disambiguates things.
For all other insns you want to omit the size specifiers as well, at
least now and then.
> --- a/opcodes/i386-dis-evex-prefix.h
> +++ b/opcodes/i386-dis-evex-prefix.h
> @@ -297,6 +297,7 @@
> { "vrndscalep%XH", { XM, EXxh, EXxEVexS, Ib }, 0 },
> { Bad_Opcode },
> { "vrndscalep%XS", { XM, EXx, EXxEVexS, Ib }, 0 },
> + { "vrndscalenep%XB", { XM, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A0A */
> {
> @@ -309,6 +310,7 @@
> { "vgetmantp%XH", { XM, EXxh, EXxEVexS, Ib }, 0 },
> { Bad_Opcode },
> { "vgetmantp%XW", { XM, EXx, EXxEVexS, Ib }, 0 },
> + { "vgetmantp%XB", { XM, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A27 */
> {
> @@ -327,6 +329,7 @@
> { "vreducep%XH", { XM, EXxh, EXxEVexS, Ib }, 0 },
> { Bad_Opcode },
> { "vreducep%XW", { XM, EXx, EXxEVexS, Ib }, 0 },
> + { "vreducenep%XB", { XM, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A57 */
> {
> @@ -339,6 +342,7 @@
> { "vfpclassp%XH%XZ", { MaskG, EXxh, Ib }, 0 },
> { Bad_Opcode },
> { "vfpclassp%XW%XZ", { MaskG, EXx, Ib }, 0 },
> + { "vfpclassp%XB%XZ", { MaskG, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A67 */
> {
> @@ -350,6 +354,8 @@
> {
> { "vcmpp%XH", { MaskG, Vex, EXxh, EXxEVexS, CMP }, 0 },
> { "vcmps%XH", { MaskG, VexScalar, EXw, EXxEVexS, CMP }, 0 },
> + { Bad_Opcode },
> + { "vcmpp%XB", { MaskG, Vex, EXxh, CMP }, 0 },
> },
> /* PREFIX_EVEX_MAP4_4x */
> {
> @@ -440,21 +446,26 @@
> /* PREFIX_EVEX_MAP5_2F */
> {
> { "vcomisY%XH", { XMScalar, EXw, EXxEVexS }, 0 },
> + { Bad_Opcode },
> + { "vcoms%XB", { XMScalar, EXw, EXxEVexS }, 0 },
> },
> /* PREFIX_EVEX_MAP5_51 */
> {
> { "vsqrtp%XH", { XM, EXxh, EXxEVexR }, 0 },
> { "vsqrts%XH", { XMScalar, VexScalar, EXw, EXxEVexR }, 0 },
> + { "vsqrtnep%XB", { XM, EXxh }, 0 },
At the example of this: Can you please avoid adding further inconsistent
padding between mnemonic string and operands. I realize it's already not
very consistent (apparently largely from FP16 additions), but it would
be nice if things didn't grow yet worse. Of course ...
> },
> /* PREFIX_EVEX_MAP5_58 */
> {
> { "vaddp%XH", { XM, Vex, EXxh, EXxEVexR }, 0 },
> { "vadds%XH", { XMScalar, VexScalar, EXw, EXxEVexR }, 0 },
> + { "vaddnep%XB", { XM, Vex, EXxh }, 0 },
... in cases like this one all is fine: The new mnemonic string simply
is longer than what present padding accounts for. No need to increase
the diff / churn by adding more padding there.
> @@ -540,6 +555,30 @@
> { Bad_Opcode },
> { "vcvtp%XH2psx", { XM, EXxmmqh, EXxEVexS }, 0 },
> },
> + /* PREFIX_EVEX_MAP6_2C */
> + {
> + { "vscalefp%XB", { XM, Vex, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vscalefp%XH", { XM, Vex, EXxh, EXxEVexR }, 0 },
> + },
> + /* PREFIX_EVEX_MAP6_42 */
> + {
> + { "vgetexpp%XB", { XM, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vgetexpp%XH", { XM, EXxh, EXxEVexS }, 0 },
> + },
> + /* PREFIX_EVEX_MAP6_4C */
> + {
> + { "vrcpp%XB", { XM, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vrcpp%XH", { XM, EXxh }, 0 },
> + },
> + /* PREFIX_EVEX_MAP6_4E */
> + {
> + { "vrsqrtp%XB", { XM, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vrsqrtp%XH", { XM, EXxh }, 0 },
> + },
In entirely new table entries you of course want to be consistent, too
(in itself as well as with adjacent entries).
> @@ -1830,6 +1846,7 @@ struct dis386 {
> "LP" => print 'w' or 'l' ('d' in Intel mode) if instruction has
> an operand size prefix, or suffix_always is true. print
> 'q' if rex prefix is present.
> + "XB" => print 'bf16' if EVEX.W=0, EVEX.W=1 is not a valid encoding (for BF16)
Can this please be put next to "XH"? Also please pay attention to line
length.
> @@ -10445,6 +10462,24 @@ putop (instr_info *ins, const char *in_template, int sizeflag)
>
> goto case_B;
> }
> + else if (l == 1 && last[0] == 'X')
> + {
> + if (ins->vex.w == 0)
Please don't use == or != with booleans.
> + {
> + *ins->obufp++ = 'b';
> + *ins->obufp++ = 'f';
> + *ins->obufp++ = '1';
> + *ins->obufp++ = '6';
> + }
> + else
> + {
> + *ins->obufp++ = '{';
> + *ins->obufp++ = 'b';
> + *ins->obufp++ = 'a';
> + *ins->obufp++ = 'd';
> + *ins->obufp++ = '}';
> + }
We need to stop such (incomplete) open-coding of oappend(). %XH handling
(which one might expect you cloned this from) does use the function.
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -3452,4 +3452,24 @@ vcvthf82ph, 0xf21e, AVX10_2, Modrm|EVexMap5|EVex128|VexW0|Masking|Disp8MemShift=
> vcvthf82ph, 0xf21e, AVX10_2, Modrm|EVexMap5|EVex256|VexW0|Masking|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
> vcvthf82ph, 0xf21e, AVX10_2, Modrm|EVexMap5|EVex512|VexW0|Masking|Disp8MemShift=5|NoSuf, { RegYMM|Unspecified|BaseIndex, RegZMM }
>
> +v<fop><fop:ne>pbf16, 0x66<fop:opc>, AVX10_2, Modrm|EVexMap5|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
> +
> +v<fm><fma>nepbf16, 0x<fm:opc3> | 0x<fma:opc>, AVX10_2, Modrm|EVexMap6|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
> +
> +vcmppbf16, 0xf2c2, AVX10_2, Modrm|Space0F3A|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegMask }
> +vfpclasspbf16, 0xf266, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|BaseIndex, RegMask }
> +vfpclasspbf16x, 0xf266, AVX10_2, Modrm|Space0F3A|EVex128|VexW0|Masking|Disp8MemShift=4|NoSuf|ATTSyntax, { Imm8, RegXMM|Unspecified|BaseIndex, RegMask }
> +vfpclasspbf16y, 0xf266, AVX10_2, Modrm|Space0F3A|EVex256|VexW0|Masking|Disp8MemShift=5|NoSuf|ATTSyntax, { Imm8, RegYMM|Unspecified|BaseIndex, RegMask }
> +vfpclasspbf16z, 0xf266, AVX10_2, Modrm|Space0F3A|EVex512|VexW0|Masking|Disp8MemShift=6|NoSuf|ATTSyntax, { Imm8, RegZMM|Unspecified|BaseIndex, RegMask }
I was meaning to ask for use of <xyz> here, just like VFPCLASSPH does. But it
looks like that wouldn't be quite right. Nevertheless I think there's a minor
issue with AVX10.2/128 mode here, where in Intel syntax no "xmmword ptr" would
be needed anymore (for disambiguation). Since that mode is hypothetical as of
now, I'm not going to insist that you address this issue.
> +vgetexppbf16, 0x42, AVX10_2, Modrm|EVexMap6|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vgetmantpbf16, 0xf226, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vrcppbf16, 0x4c, AVX10_2, Modrm|EVexMap6|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vreducenepbf16, 0xf256, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vrndscalenepbf16, 0xf208, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vrsqrtpbf16, 0x4e, AVX10_2, Modrm|EVexMap6|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vscalefpbf16, 0x2c, AVX10_2, Modrm|EVexMap6|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
While this matches the spec, I still wonder: Not VSCALEFNEPBF16? I guess
what I'm struggling with is the pattern when the NE infix would be added.
My best present guess is that it means "no embedded rounding", and is
intended to be used when the counterpart PS/PD/PH insns would support
{er}. The issue would then extend to at least VMINMAXNEPBF16, which imo
wants to be VMINMAXPBF16 (matching V{MIN,MAX}PBF16) and VRNDSCALENEPBF16
(ought to be VRNDSCALEPBF16); I may have overlooked others.
> +vsqrtnepbf16, 0x6651, AVX10_2, Modrm|EVexMap5|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +
> +vcomsbf16, 0x662f, AVX10_2, Modrm|EVexMap5|EVexLIG|VexW0|Disp8MemShift=1|NoSuf, { RegXMM|Word|Unspecified|BaseIndex, RegXMM }
While this matches the spec, I further wonder: No VUCOMSBF16?
Jan
- Previous message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Next message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the Binutils mailing list