[PATCH v2] Support Intel AVX10.2 BF16 instructions
Jan Beulich
jbeulich@suse.com
Fri Oct 18 13:07:14 GMT 2024
More information about the Binutils mailing list
Fri Oct 18 13:07:14 GMT 2024
- Previous message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Next message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
On 17.10.2024 09:28, Haochen Jiang wrote:
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/avx10_2-256-bf16.s
> @@ -0,0 +1,169 @@
> +# Check 32bit AVX10.2/256 instructions
> +
> + .arch generic32
> + .arch .avx10.2/256
> + .text
> +_start:
> + .irp m, addne, divne, max, min, mulne, scalef, subne
> + v\m\()pbf16 %ymm4, %ymm5, %ymm6
> + v\m\()pbf16 %xmm4, %xmm5, %xmm6
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + v\m\()pbf16 (%ecx){1to16}, %ymm5, %ymm6
> + v\m\()pbf16 4064(%ecx), %ymm5, %ymm6
> + v\m\()pbf16 -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + v\m\()pbf16 (%ecx){1to8}, %xmm5, %xmm6
> + v\m\()pbf16 2032(%ecx), %xmm5, %xmm6
> + v\m\()pbf16 -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> + .endr
> +
> + .irp m, madd, msub, nmadd, nmsub
> + .irp n, 132, 213, 231
> + vf\m\n\(\())nepbf16 %ymm4, %ymm5, %ymm6
> + vf\m\n\(\())nepbf16 %xmm4, %xmm5, %xmm6
> + vf\m\n\(\())nepbf16 0x10000000(%esp, %esi, 8), %ymm5, %ymm6{%k7}
> + vf\m\n\(\())nepbf16 (%ecx){1to16}, %ymm5, %ymm6
> + vf\m\n\(\())nepbf16 4064(%ecx), %ymm5, %ymm6
> + vf\m\n\(\())nepbf16 -256(%edx){1to16}, %ymm5, %ymm6{%k7}{z}
> + vf\m\n\(\())nepbf16 0x10000000(%esp, %esi, 8), %xmm5, %xmm6{%k7}
> + vf\m\n\(\())nepbf16 (%ecx){1to8}, %xmm5, %xmm6
> + vf\m\n\(\())nepbf16 2032(%ecx), %xmm5, %xmm6
> + vf\m\n\(\())nepbf16 -256(%edx){1to8}, %xmm5, %xmm6{%k7}{z}
> + .endr
> + .endr
> +
> + .irp m, getexp, rcp, rsqrt, sqrtne
> + v\m\()pbf16 %xmm5, %xmm6
> + v\m\()pbf16 %ymm5, %ymm6
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %xmm6{%k7}
> + v\m\()pbf16 (%ecx){1to8}, %xmm6
> + v\m\()pbf16 2032(%ecx), %xmm6
> + v\m\()pbf16 -256(%edx){1to8}, %xmm6{%k7}{z}
> + v\m\()pbf16 0x10000000(%esp, %esi, 8), %ymm6{%k7}
> + v\m\()pbf16 (%ecx){1to16}, %ymm6
> + v\m\()pbf16 4064(%ecx), %ymm6
> + v\m\()pbf16 -256(%edx){1to16}, %ymm6{%k7}{z}
> + .endr
> +
> + .irp m, getmant, reducene, rndscalene
> + v\m\()pbf16 $123, %ymm5, %ymm6
> + v\m\()pbf16 $123, %xmm5, %xmm6
> + v\m\()pbf16 $123, 0x10000000(%esp, %esi, 8), %xmm6{%k7}
> + v\m\()pbf16 $123, (%ecx){1to8}, %xmm6
> + v\m\()pbf16 $123, 2032(%ecx), %xmm6
> + v\m\()pbf16 $123, -256(%edx){1to8}, %xmm6{%k7}{z}
> + v\m\()pbf16 $123, 0x10000000(%esp, %esi, 8), %ymm6{%k7}
> + v\m\()pbf16 $123, (%ecx){1to16}, %ymm6
> + v\m\()pbf16 $123, 4064(%ecx), %ymm6
> + v\m\()pbf16 $123, -256(%edx){1to16}, %ymm6{%k7}{z}
> + .endr
> +
> + vcmppbf16 $123, %ymm4, %ymm5, %k5
> + vcmppbf16 $123, %xmm4, %xmm5, %k5
> + vcmppbf16 $123, 0x10000000(%esp, %esi, 8), %xmm5, %k5{%k7}
> + vcmppbf16 $123, (%ecx){1to8}, %xmm5, %k5
> + vcmppbf16 $123, 2032(%ecx), %xmm5, %k5
> + vcmppbf16 $123, -256(%edx){1to8}, %xmm5, %k5{%k7}
> + vcmppbf16 $123, 0x10000000(%esp, %esi, 8), %ymm5, %k5{%k7}
> + vcmppbf16 $123, (%ecx){1to16}, %ymm5, %k5
> + vcmppbf16 $123, 4064(%ecx), %ymm5, %k5
> + vcmppbf16 $123, -256(%edx){1to16}, %ymm5, %k5{%k7}
> +
> + vcomsbf16 %xmm5, %xmm6
> + vcomsbf16 0x10000000(%esp, %esi, 8), %xmm6
> + vcomsbf16 (%ecx), %xmm6
> + vcomsbf16 254(%ecx), %xmm6
> + vcomsbf16 -256(%edx), %xmm6
> +
> + vfpclasspbf16 $123, %ymm5, %k5
> + vfpclasspbf16 $123, %xmm5, %k5
> + vfpclasspbf16x $123, 0x10000000(%esp, %esi, 8), %k5{%k7}
> + vfpclasspbf16 $123, (%ecx){1to8}, %k5
> + vfpclasspbf16x $123, 2032(%ecx), %k5
> + vfpclasspbf16 $123, -256(%edx){1to8}, %k5{%k7}
> + vfpclasspbf16 $123, (%ecx){1to16}, %k5
> + vfpclasspbf16y $123, 4064(%ecx), %k5
> + vfpclasspbf16 $123, -256(%edx){1to16}, %k5{%k7}
> +
> +_intel:
> + .intel_syntax noprefix
> + .irp m, addne, divne, max, min, mulne, scalef, subne
> + v\m\()pbf16 ymm6, ymm5, ymm4
> + v\m\()pbf16 xmm6, xmm5, xmm4
> + v\m\()pbf16 ymm6{k7}, ymm5, YMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 ymm6, ymm5, WORD PTR [ecx]{1to16}
> + v\m\()pbf16 ymm6, ymm5, YMMWORD PTR [ecx+4064]
> + v\m\()pbf16 ymm6{k7}{z}, ymm5, WORD PTR [edx-256]{1to16}
> + v\m\()pbf16 xmm6{k7}, xmm5, XMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 xmm6, xmm5, WORD PTR [ecx]{1to8}
> + v\m\()pbf16 xmm6, xmm5, XMMWORD PTR [ecx+2032]
> + v\m\()pbf16 xmm6{k7}{z}, xmm5, WORD PTR [edx-256]{1to8}
> + .endr
> +
> + .irp m, madd, msub, nmadd, nmsub
> + .irp n, 132, 213, 231
> + vf\m\n\(\())nepbf16 ymm6, ymm5, ymm4
> + vf\m\n\(\())nepbf16 xmm6, xmm5, xmm4
> + vf\m\n\(\())nepbf16 ymm6{k7}, ymm5, YMMWORD PTR [esp+esi*8+0x10000000]
> + vf\m\n\(\())nepbf16 ymm6, ymm5, WORD PTR [ecx]{1to16}
> + vf\m\n\(\())nepbf16 ymm6, ymm5, YMMWORD PTR [ecx+4064]
> + vf\m\n\(\())nepbf16 ymm6{k7}{z}, ymm5, WORD PTR [edx-256]{1to16}
> + vf\m\n\(\())nepbf16 xmm6{k7}, xmm5, XMMWORD PTR [esp+esi*8+0x10000000]
> + vf\m\n\(\())nepbf16 xmm6, xmm5, WORD PTR [ecx]{1to8}
> + vf\m\n\(\())nepbf16 xmm6, xmm5, XMMWORD PTR [ecx+2032]
> + vf\m\n\(\())nepbf16 xmm6{k7}{z}, xmm5, WORD PTR [edx-256]{1to8}
> + .endr
> + .endr
> +
> + .irp m, getexp, rcp, rsqrt, sqrtne
> + v\m\()pbf16 xmm6, xmm5
> + v\m\()pbf16 ymm6, ymm5
> + v\m\()pbf16 xmm6{k7}, XMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 xmm6, WORD PTR [ecx]{1to8}
> + v\m\()pbf16 xmm6, XMMWORD PTR [ecx+2032]
> + v\m\()pbf16 xmm6{k7}{z}, WORD PTR [edx-256]{1to8}
> + v\m\()pbf16 ymm6{k7}, YMMWORD PTR [esp+esi*8+0x10000000]
> + v\m\()pbf16 ymm6, WORD PTR [ecx]{1to16}
> + v\m\()pbf16 ymm6, YMMWORD PTR [ecx+4064]
> + v\m\()pbf16 ymm6{k7}{z}, WORD PTR [edx-256]{1to16}
> + .endr
> +
> + .irp m, getmant, reducene, rndscalene
> + v\m\()pbf16 ymm6, ymm5, 123
> + v\m\()pbf16 xmm6, xmm5, 123
> + v\m\()pbf16 xmm6{k7}, XMMWORD PTR [esp+esi*8+0x10000000], 123
> + v\m\()pbf16 xmm6, WORD PTR [ecx]{1to8}, 123
> + v\m\()pbf16 xmm6, XMMWORD PTR [ecx+2032], 123
> + v\m\()pbf16 xmm6{k7}{z}, WORD PTR [edx-256]{1to8}, 123
> + v\m\()pbf16 ymm6{k7}, YMMWORD PTR [esp+esi*8+0x10000000], 123
> + v\m\()pbf16 ymm6, WORD PTR [ecx]{1to16}, 123
> + v\m\()pbf16 ymm6, YMMWORD PTR [ecx+4064], 123
> + v\m\()pbf16 ymm6{k7}{z}, WORD PTR [edx-256]{1to16}, 123
> + .endr
> +
> + vcmppbf16 k5, ymm5, ymm4, 123
> + vcmppbf16 k5, xmm5, xmm4, 123
> + vcmppbf16 k5{k7}, xmm5, XMMWORD PTR [esp+esi*8+0x10000000], 123
> + vcmppbf16 k5, xmm5, WORD PTR [ecx]{1to8}, 123
> + vcmppbf16 k5, xmm5, XMMWORD PTR [ecx+2032], 123
> + vcmppbf16 k5{k7}, xmm5, WORD PTR [edx-256]{1to8}, 123
> + vcmppbf16 k5{k7}, ymm5, YMMWORD PTR [esp+esi*8+0x10000000], 123
> + vcmppbf16 k5, ymm5, WORD PTR [ecx]{1to16}, 123
> + vcmppbf16 k5, ymm5, YMMWORD PTR [ecx+4064], 123
> + vcmppbf16 k5{k7}, ymm5, WORD PTR [edx-256]{1to16}, 123
> +
> + vcomsbf16 xmm6, xmm5
> + vcomsbf16 xmm6, WORD PTR [esp+esi*8+0x10000000]
> + vcomsbf16 xmm6, WORD PTR [ecx]
> + vcomsbf16 xmm6, WORD PTR [ecx+254]
> + vcomsbf16 xmm6, WORD PTR [edx-256]
> +
> + vfpclasspbf16 k5, ymm5, 123
> + vfpclasspbf16 k5, xmm5, 123
> + vfpclasspbf16 k5{k7}, XMMWORD PTR [esp+esi*8+0x10000000], 123
> + vfpclasspbf16 k5, WORD PTR [ecx]{1to8}, 123
Here and in similar places: Can you please make sure you at least test a
mix of broadcast forms with and without "WORD PTR"? The broadcast
specifier alone disambiguates things.
For all other insns you want to omit the size specifiers as well, at
least now and then.
> --- a/opcodes/i386-dis-evex-prefix.h
> +++ b/opcodes/i386-dis-evex-prefix.h
> @@ -297,6 +297,7 @@
> { "vrndscalep%XH", { XM, EXxh, EXxEVexS, Ib }, 0 },
> { Bad_Opcode },
> { "vrndscalep%XS", { XM, EXx, EXxEVexS, Ib }, 0 },
> + { "vrndscalenep%XB", { XM, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A0A */
> {
> @@ -309,6 +310,7 @@
> { "vgetmantp%XH", { XM, EXxh, EXxEVexS, Ib }, 0 },
> { Bad_Opcode },
> { "vgetmantp%XW", { XM, EXx, EXxEVexS, Ib }, 0 },
> + { "vgetmantp%XB", { XM, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A27 */
> {
> @@ -327,6 +329,7 @@
> { "vreducep%XH", { XM, EXxh, EXxEVexS, Ib }, 0 },
> { Bad_Opcode },
> { "vreducep%XW", { XM, EXx, EXxEVexS, Ib }, 0 },
> + { "vreducenep%XB", { XM, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A57 */
> {
> @@ -339,6 +342,7 @@
> { "vfpclassp%XH%XZ", { MaskG, EXxh, Ib }, 0 },
> { Bad_Opcode },
> { "vfpclassp%XW%XZ", { MaskG, EXx, Ib }, 0 },
> + { "vfpclassp%XB%XZ", { MaskG, EXxh, Ib }, 0 },
> },
> /* PREFIX_EVEX_0F3A67 */
> {
> @@ -350,6 +354,8 @@
> {
> { "vcmpp%XH", { MaskG, Vex, EXxh, EXxEVexS, CMP }, 0 },
> { "vcmps%XH", { MaskG, VexScalar, EXw, EXxEVexS, CMP }, 0 },
> + { Bad_Opcode },
> + { "vcmpp%XB", { MaskG, Vex, EXxh, CMP }, 0 },
> },
> /* PREFIX_EVEX_MAP4_4x */
> {
> @@ -440,21 +446,26 @@
> /* PREFIX_EVEX_MAP5_2F */
> {
> { "vcomisY%XH", { XMScalar, EXw, EXxEVexS }, 0 },
> + { Bad_Opcode },
> + { "vcoms%XB", { XMScalar, EXw, EXxEVexS }, 0 },
> },
> /* PREFIX_EVEX_MAP5_51 */
> {
> { "vsqrtp%XH", { XM, EXxh, EXxEVexR }, 0 },
> { "vsqrts%XH", { XMScalar, VexScalar, EXw, EXxEVexR }, 0 },
> + { "vsqrtnep%XB", { XM, EXxh }, 0 },
At the example of this: Can you please avoid adding further inconsistent
padding between mnemonic string and operands. I realize it's already not
very consistent (apparently largely from FP16 additions), but it would
be nice if things didn't grow yet worse. Of course ...
> },
> /* PREFIX_EVEX_MAP5_58 */
> {
> { "vaddp%XH", { XM, Vex, EXxh, EXxEVexR }, 0 },
> { "vadds%XH", { XMScalar, VexScalar, EXw, EXxEVexR }, 0 },
> + { "vaddnep%XB", { XM, Vex, EXxh }, 0 },
... in cases like this one all is fine: The new mnemonic string simply
is longer than what present padding accounts for. No need to increase
the diff / churn by adding more padding there.
> @@ -540,6 +555,30 @@
> { Bad_Opcode },
> { "vcvtp%XH2psx", { XM, EXxmmqh, EXxEVexS }, 0 },
> },
> + /* PREFIX_EVEX_MAP6_2C */
> + {
> + { "vscalefp%XB", { XM, Vex, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vscalefp%XH", { XM, Vex, EXxh, EXxEVexR }, 0 },
> + },
> + /* PREFIX_EVEX_MAP6_42 */
> + {
> + { "vgetexpp%XB", { XM, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vgetexpp%XH", { XM, EXxh, EXxEVexS }, 0 },
> + },
> + /* PREFIX_EVEX_MAP6_4C */
> + {
> + { "vrcpp%XB", { XM, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vrcpp%XH", { XM, EXxh }, 0 },
> + },
> + /* PREFIX_EVEX_MAP6_4E */
> + {
> + { "vrsqrtp%XB", { XM, EXxh }, 0 },
> + { Bad_Opcode },
> + { "vrsqrtp%XH", { XM, EXxh }, 0 },
> + },
In entirely new table entries you of course want to be consistent, too
(in itself as well as with adjacent entries).
> @@ -1830,6 +1846,7 @@ struct dis386 {
> "LP" => print 'w' or 'l' ('d' in Intel mode) if instruction has
> an operand size prefix, or suffix_always is true. print
> 'q' if rex prefix is present.
> + "XB" => print 'bf16' if EVEX.W=0, EVEX.W=1 is not a valid encoding (for BF16)
Can this please be put next to "XH"? Also please pay attention to line
length.
> @@ -10445,6 +10462,24 @@ putop (instr_info *ins, const char *in_template, int sizeflag)
>
> goto case_B;
> }
> + else if (l == 1 && last[0] == 'X')
> + {
> + if (ins->vex.w == 0)
Please don't use == or != with booleans.
> + {
> + *ins->obufp++ = 'b';
> + *ins->obufp++ = 'f';
> + *ins->obufp++ = '1';
> + *ins->obufp++ = '6';
> + }
> + else
> + {
> + *ins->obufp++ = '{';
> + *ins->obufp++ = 'b';
> + *ins->obufp++ = 'a';
> + *ins->obufp++ = 'd';
> + *ins->obufp++ = '}';
> + }
We need to stop such (incomplete) open-coding of oappend(). %XH handling
(which one might expect you cloned this from) does use the function.
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -3452,4 +3452,24 @@ vcvthf82ph, 0xf21e, AVX10_2, Modrm|EVexMap5|EVex128|VexW0|Masking|Disp8MemShift=
> vcvthf82ph, 0xf21e, AVX10_2, Modrm|EVexMap5|EVex256|VexW0|Masking|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
> vcvthf82ph, 0xf21e, AVX10_2, Modrm|EVexMap5|EVex512|VexW0|Masking|Disp8MemShift=5|NoSuf, { RegYMM|Unspecified|BaseIndex, RegZMM }
>
> +v<fop><fop:ne>pbf16, 0x66<fop:opc>, AVX10_2, Modrm|EVexMap5|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
> +
> +v<fm><fma>nepbf16, 0x<fm:opc3> | 0x<fma:opc>, AVX10_2, Modrm|EVexMap6|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
> +
> +vcmppbf16, 0xf2c2, AVX10_2, Modrm|Space0F3A|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegMask }
> +vfpclasspbf16, 0xf266, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|BaseIndex, RegMask }
> +vfpclasspbf16x, 0xf266, AVX10_2, Modrm|Space0F3A|EVex128|VexW0|Masking|Disp8MemShift=4|NoSuf|ATTSyntax, { Imm8, RegXMM|Unspecified|BaseIndex, RegMask }
> +vfpclasspbf16y, 0xf266, AVX10_2, Modrm|Space0F3A|EVex256|VexW0|Masking|Disp8MemShift=5|NoSuf|ATTSyntax, { Imm8, RegYMM|Unspecified|BaseIndex, RegMask }
> +vfpclasspbf16z, 0xf266, AVX10_2, Modrm|Space0F3A|EVex512|VexW0|Masking|Disp8MemShift=6|NoSuf|ATTSyntax, { Imm8, RegZMM|Unspecified|BaseIndex, RegMask }
I was meaning to ask for use of <xyz> here, just like VFPCLASSPH does. But it
looks like that wouldn't be quite right. Nevertheless I think there's a minor
issue with AVX10.2/128 mode here, where in Intel syntax no "xmmword ptr" would
be needed anymore (for disambiguation). Since that mode is hypothetical as of
now, I'm not going to insist that you address this issue.
> +vgetexppbf16, 0x42, AVX10_2, Modrm|EVexMap6|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vgetmantpbf16, 0xf226, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vrcppbf16, 0x4c, AVX10_2, Modrm|EVexMap6|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vreducenepbf16, 0xf256, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vrndscalenepbf16, 0xf208, AVX10_2, Modrm|Space0F3A|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vrsqrtpbf16, 0x4e, AVX10_2, Modrm|EVexMap6|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +vscalefpbf16, 0x2c, AVX10_2, Modrm|EVexMap6|Src1VVVV|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
While this matches the spec, I still wonder: Not VSCALEFNEPBF16? I guess
what I'm struggling with is the pattern when the NE infix would be added.
My best present guess is that it means "no embedded rounding", and is
intended to be used when the counterpart PS/PD/PH insns would support
{er}. The issue would then extend to at least VMINMAXNEPBF16, which imo
wants to be VMINMAXPBF16 (matching V{MIN,MAX}PBF16) and VRNDSCALENEPBF16
(ought to be VRNDSCALEPBF16); I may have overlooked others.
> +vsqrtnepbf16, 0x6651, AVX10_2, Modrm|EVexMap5|VexW0|Masking|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Word|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
> +
> +vcomsbf16, 0x662f, AVX10_2, Modrm|EVexMap5|EVexLIG|VexW0|Disp8MemShift=1|NoSuf, { RegXMM|Word|Unspecified|BaseIndex, RegXMM }
While this matches the spec, I further wonder: No VUCOMSBF16?
Jan
- Previous message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Next message (by thread): [PATCH v2] Support Intel AVX10.2 BF16 instructions
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the Binutils mailing list