Implement _mm256_permute2f128_ps and _mm256_permute2f128_pd intrinsics · rust-lang/rust@6a53ace

@@ -172,8 +172,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

172172

}

173173

}

174174

}

175-

"llvm.x86.avx2.vperm2i128" => {

175+

"llvm.x86.avx2.vperm2i128"

176+

| "llvm.x86.avx.vperm2f128.ps.256"

177+

| "llvm.x86.avx.vperm2f128.pd.256" => {

176178

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256

179+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps

180+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd

177181

let (a, b, imm8) = match args {

178182

[a, b, imm8] => (a, b, imm8),

179183

_ => bug!("wrong number of args for intrinsic {intrinsic}"),

@@ -182,19 +186,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

182186

let b = codegen_operand(fx, b);

183187

let imm8 = codegen_operand(fx, imm8).load_scalar(fx);

184188

185-

let a_0 = a.value_lane(fx, 0).load_scalar(fx);

186-

let a_1 = a.value_lane(fx, 1).load_scalar(fx);

187-

let a_low = fx.bcx.ins().iconcat(a_0, a_1);

188-

let a_2 = a.value_lane(fx, 2).load_scalar(fx);

189-

let a_3 = a.value_lane(fx, 3).load_scalar(fx);

190-

let a_high = fx.bcx.ins().iconcat(a_2, a_3);

189+

let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);

190+

let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

191191

192-

let b_0 = b.value_lane(fx, 0).load_scalar(fx);

193-

let b_1 = b.value_lane(fx, 1).load_scalar(fx);

194-

let b_low = fx.bcx.ins().iconcat(b_0, b_1);

195-

let b_2 = b.value_lane(fx, 2).load_scalar(fx);

196-

let b_3 = b.value_lane(fx, 3).load_scalar(fx);

197-

let b_high = fx.bcx.ins().iconcat(b_2, b_3);

192+

let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);

193+

let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

198194

199195

fn select4(

200196

fx: &mut FunctionCx<'_, '_, '_>,

@@ -219,16 +215,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

219215

220216

let control0 = imm8;

221217

let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);

222-

let (res_0, res_1) = fx.bcx.ins().isplit(res_low);

223218

224219

let control1 = fx.bcx.ins().ushr_imm(imm8, 4);

225220

let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);

226-

let (res_2, res_3) = fx.bcx.ins().isplit(res_high);

227221

228-

ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());

229-

ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());

230-

ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());

231-

ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());

222+

ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(

223+

fx,

224+

res_low,

225+

MemFlags::trusted(),

226+

);

227+

ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(

228+

fx,

229+

res_high,

230+

MemFlags::trusted(),

231+

);

232232

}

233233

"llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {

234234

let a = match args {