Implement _mm256_permute2f128_ps and _mm256_permute2f128_pd intrinsics · rust-lang/rust@6a53ace
@@ -172,8 +172,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
172172}
173173}
174174}
175-"llvm.x86.avx2.vperm2i128" => {
175+"llvm.x86.avx2.vperm2i128"
176+ | "llvm.x86.avx.vperm2f128.ps.256"
177+ | "llvm.x86.avx.vperm2f128.pd.256" => {
176178// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
179+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps
180+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd
177181let (a, b, imm8) = match args {
178182[a, b, imm8] => (a, b, imm8),
179183 _ => bug!("wrong number of args for intrinsic {intrinsic}"),
@@ -182,19 +186,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
182186let b = codegen_operand(fx, b);
183187let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
184188
185-let a_0 = a.value_lane(fx, 0).load_scalar(fx);
186-let a_1 = a.value_lane(fx, 1).load_scalar(fx);
187-let a_low = fx.bcx.ins().iconcat(a_0, a_1);
188-let a_2 = a.value_lane(fx, 2).load_scalar(fx);
189-let a_3 = a.value_lane(fx, 3).load_scalar(fx);
190-let a_high = fx.bcx.ins().iconcat(a_2, a_3);
189+let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
190+let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
191191
192-let b_0 = b.value_lane(fx, 0).load_scalar(fx);
193-let b_1 = b.value_lane(fx, 1).load_scalar(fx);
194-let b_low = fx.bcx.ins().iconcat(b_0, b_1);
195-let b_2 = b.value_lane(fx, 2).load_scalar(fx);
196-let b_3 = b.value_lane(fx, 3).load_scalar(fx);
197-let b_high = fx.bcx.ins().iconcat(b_2, b_3);
192+let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
193+let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
198194
199195fn select4(
200196fx: &mut FunctionCx<'_, '_, '_>,
@@ -219,16 +215,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
219215
220216let control0 = imm8;
221217let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
222-let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
223218
224219let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
225220let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
226-let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
227221
228- ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
229- ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
230- ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
231- ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
222+ ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(
223+ fx,
224+ res_low,
225+MemFlags::trusted(),
226+);
227+ ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(
228+ fx,
229+ res_high,
230+MemFlags::trusted(),
231+);
232232}
233233"llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
234234let a = match args {