Implement all vendor intrinsics used by the simd-json crate · rust-lang/rust@8649731

@@ -590,6 +590,44 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

590590

}

591591

}

592592593+

"llvm.x86.sse41.packusdw" => {

594+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912

595+

intrinsic_args!(fx, args => (a, b); intrinsic);

596+597+

assert_eq!(a.layout(), b.layout());

598+

let layout = a.layout();

599+600+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

601+

let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);

602+

assert_eq!(lane_ty, fx.tcx.types.i32);

603+

assert_eq!(ret_lane_ty, fx.tcx.types.u16);

604+

assert_eq!(lane_count * 2, ret_lane_count);

605+606+

let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));

607+

let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));

608+

let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);

609+610+

for idx in 0..lane_count {

611+

let lane = a.value_lane(fx, idx).load_scalar(fx);

612+

let sat = fx.bcx.ins().umax(lane, min_u16);

613+

let sat = fx.bcx.ins().umin(sat, max_u16);

614+

let res = fx.bcx.ins().ireduce(types::I16, sat);

615+616+

let res_lane = CValue::by_val(res, ret_lane_layout);

617+

ret.place_lane(fx, idx).write_cvalue(fx, res_lane);

618+

}

619+620+

for idx in 0..lane_count {

621+

let lane = b.value_lane(fx, idx).load_scalar(fx);

622+

let sat = fx.bcx.ins().umax(lane, min_u16);

623+

let sat = fx.bcx.ins().umin(sat, max_u16);

624+

let res = fx.bcx.ins().ireduce(types::I16, sat);

625+626+

let res_lane = CValue::by_val(res, ret_lane_layout);

627+

ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);

628+

}

629+

}

630+593631

"llvm.x86.avx2.packssdw" => {

594632

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892

595633

intrinsic_args!(fx, args => (a, b); intrinsic);

@@ -648,6 +686,106 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

648686

}

649687

}

650688689+

"llvm.x86.pclmulqdq" => {

690+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772

691+

intrinsic_args!(fx, args => (a, b, imm8); intrinsic);

692+693+

assert_eq!(a.layout(), b.layout());

694+

let layout = a.layout();

695+696+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

697+

let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);

698+

assert_eq!(lane_ty, fx.tcx.types.i64);

699+

assert_eq!(ret_lane_ty, fx.tcx.types.i64);

700+

assert_eq!(lane_count, 2);

701+

assert_eq!(ret_lane_count, 2);

702+703+

let imm8 = imm8.load_scalar(fx);

704+705+

let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);

706+

let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);

707+

let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);

708+

let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);

709+710+

let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);

711+

let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);

712+

let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);

713+

let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);

714+715+

fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {

716+

let tmp = fx.bcx.ins().ushr_imm(val, bit);

717+

fx.bcx.ins().band_imm(tmp, 1)

718+

}

719+720+

let mut res1 = fx.bcx.ins().iconst(types::I64, 0);

721+

for i in 0..=63 {

722+

let x = extract_bit(fx, temp1, 0);

723+

let y = extract_bit(fx, temp2, i);

724+

let mut temp = fx.bcx.ins().band(x, y);

725+

for j in 1..=i {

726+

let x = extract_bit(fx, temp1, j);

727+

let y = extract_bit(fx, temp2, i - j);

728+

let z = fx.bcx.ins().band(x, y);

729+

temp = fx.bcx.ins().bxor(temp, z);

730+

}

731+

let temp = fx.bcx.ins().ishl_imm(temp, i);

732+

res1 = fx.bcx.ins().bor(res1, temp);

733+

}

734+

ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());

735+736+

let mut res2 = fx.bcx.ins().iconst(types::I64, 0);

737+

for i in 64..=127 {

738+

let mut temp = fx.bcx.ins().iconst(types::I64, 0);

739+

for j in i - 63..=63 {

740+

let x = extract_bit(fx, temp1, j);

741+

let y = extract_bit(fx, temp2, i - j);

742+

let z = fx.bcx.ins().band(x, y);

743+

temp = fx.bcx.ins().bxor(temp, z);

744+

}

745+

let temp = fx.bcx.ins().ishl_imm(temp, i);

746+

res2 = fx.bcx.ins().bor(res2, temp);

747+

}

748+

ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());

749+

}

750+751+

"llvm.x86.avx.ptestz.256" => {

752+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945

753+

intrinsic_args!(fx, args => (a, b); intrinsic);

754+755+

assert_eq!(a.layout(), b.layout());

756+

let layout = a.layout();

757+758+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

759+

assert_eq!(lane_ty, fx.tcx.types.i64);

760+

assert_eq!(ret.layout().ty, fx.tcx.types.i32);

761+

assert_eq!(lane_count, 4);

762+763+

let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);

764+

let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);

765+

let a_lane2 = a.value_lane(fx, 2).load_scalar(fx);

766+

let a_lane3 = a.value_lane(fx, 3).load_scalar(fx);

767+

let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);

768+

let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);

769+

let b_lane2 = b.value_lane(fx, 2).load_scalar(fx);

770+

let b_lane3 = b.value_lane(fx, 3).load_scalar(fx);

771+772+

let zero0 = fx.bcx.ins().band(a_lane0, b_lane0);

773+

let zero1 = fx.bcx.ins().band(a_lane1, b_lane1);

774+

let zero2 = fx.bcx.ins().band(a_lane2, b_lane2);

775+

let zero3 = fx.bcx.ins().band(a_lane3, b_lane3);

776+777+

let all_zero0 = fx.bcx.ins().bor(zero0, zero1);

778+

let all_zero1 = fx.bcx.ins().bor(zero2, zero3);

779+

let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1);

780+781+

let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0);

782+

let res = CValue::by_val(

783+

fx.bcx.ins().uextend(types::I32, res),

784+

fx.layout_of(fx.tcx.types.i32),

785+

);

786+

ret.write_cvalue(fx, res);

787+

}

788+651789

_ => {

652790

fx.tcx

653791

.sess