Implement all SSE intrinsics used by the jpeg-decoder crate · rust-lang/rust@61e38ce

@@ -413,6 +413,77 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

413413

ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);

414414

}

415415

}

416+417+

"llvm.x86.ssse3.pmul.hr.sw.128" => {

418+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782

419+

intrinsic_args!(fx, args => (a, b); intrinsic);

420+421+

assert_eq!(a.layout(), b.layout());

422+

let layout = a.layout();

423+424+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

425+

let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);

426+

assert_eq!(lane_ty, fx.tcx.types.i16);

427+

assert_eq!(ret_lane_ty, fx.tcx.types.i16);

428+

assert_eq!(lane_count, ret_lane_count);

429+430+

let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);

431+

for out_lane_idx in 0..lane_count {

432+

let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx);

433+

let a_lane = fx.bcx.ins().sextend(types::I32, a_lane);

434+

let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx);

435+

let b_lane = fx.bcx.ins().sextend(types::I32, b_lane);

436+437+

let mul: Value = fx.bcx.ins().imul(a_lane, b_lane);

438+

let shifted = fx.bcx.ins().ushr_imm(mul, 14);

439+

let incremented = fx.bcx.ins().iadd_imm(shifted, 1);

440+

let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1);

441+442+

let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again);

443+

let res_lane = CValue::by_val(res_lane, ret_lane_layout);

444+445+

ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);

446+

}

447+

}

448+449+

"llvm.x86.sse2.packuswb.128" => {

450+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903

451+

intrinsic_args!(fx, args => (a, b); intrinsic);

452+453+

assert_eq!(a.layout(), b.layout());

454+

let layout = a.layout();

455+456+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

457+

let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);

458+

assert_eq!(lane_ty, fx.tcx.types.i16);

459+

assert_eq!(ret_lane_ty, fx.tcx.types.u8);

460+

assert_eq!(lane_count * 2, ret_lane_count);

461+462+

let zero = fx.bcx.ins().iconst(types::I16, 0);

463+

let max_u8 = fx.bcx.ins().iconst(types::I16, 255);

464+

let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);

465+466+

for idx in 0..lane_count {

467+

let lane = a.value_lane(fx, idx).load_scalar(fx);

468+

let sat = fx.bcx.ins().smax(lane, zero);

469+

let sat = fx.bcx.ins().umin(sat, max_u8);

470+

let res = fx.bcx.ins().ireduce(types::I8, sat);

471+472+

let res_lane = CValue::by_val(res, ret_lane_layout);

473+

ret.place_lane(fx, idx).write_cvalue(fx, res_lane);

474+

}

475+476+

for idx in 0..lane_count {

477+

let lane = b.value_lane(fx, idx).load_scalar(fx);

478+

let sat = fx.bcx.ins().smax(lane, zero);

479+

let sat = fx.bcx.ins().umin(sat, max_u8);

480+

let res = fx.bcx.ins().ireduce(types::I8, sat);

481+482+

let res_lane = CValue::by_val(res, ret_lane_layout);

483+

ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);

484+

}

485+

}

486+416487

_ => {

417488

fx.tcx

418489

.sess