Skip to content

Commit

Permalink
Add support for fmsub and fnmsub instructions on x86 (bytecodeallianc…
Browse files Browse the repository at this point in the history
…e#8888)

* add fmsub and fnmsub instructions

* write tests for fmsub and fnmsub

* add a single runtest

* add more tests

* fix fmnsub_f32 test

* add reference to issue bytecodealliance#8953
  • Loading branch information
UnlimitedHummus authored Jul 13, 2024
1 parent 99b739f commit 225d20e
Show file tree
Hide file tree
Showing 6 changed files with 890 additions and 1 deletion.
48 changes: 48 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,22 @@
Vfnmadd132sd
Vfnmadd132ps
Vfnmadd132pd
Vfmsub213ss
Vfmsub213sd
Vfmsub213ps
Vfmsub213pd
Vfmsub132ss
Vfmsub132sd
Vfmsub132ps
Vfmsub132pd
Vfnmsub213ss
Vfnmsub213sd
Vfnmsub213ps
Vfnmsub213pd
Vfnmsub132ss
Vfnmsub132sd
Vfnmsub132ps
Vfnmsub132pd
Vcmpps
Vcmppd
Vpsrlw
Expand Down Expand Up @@ -4371,6 +4387,38 @@
(rule (x64_vfnmadd132 $F32X4 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132ps) a b c))
(rule (x64_vfnmadd132 $F64X2 a b c) (xmm_rmr_vex3 (AvxOpcode.Vfnmadd132pd) a b c))

;; Helper for creating `vfmsub213*` instructions.
;;
;; Picks the scalar (`ss`/`sd`) or packed (`ps`/`pd`) opcode from the type and
;; forwards the three operands, in order, to `xmm_rmr_vex3`. Only the final
;; operand may be a memory operand (`XmmMem`); the first two must be registers.
(decl x64_vfmsub213 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfmsub213 $F32 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub213ss) src1 src2 src3))
(rule (x64_vfmsub213 $F64 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub213sd) src1 src2 src3))
(rule (x64_vfmsub213 $F32X4 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub213ps) src1 src2 src3))
(rule (x64_vfmsub213 $F64X2 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub213pd) src1 src2 src3))

;; Helper for creating `vfmsub132*` instructions.
;;
;; Picks the scalar (`ss`/`sd`) or packed (`ps`/`pd`) opcode from the type and
;; forwards the three operands, in order, to `xmm_rmr_vex3`. Only the final
;; operand may be a memory operand (`XmmMem`); the first two must be registers.
(decl x64_vfmsub132 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfmsub132 $F32 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub132ss) src1 src2 src3))
(rule (x64_vfmsub132 $F64 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub132sd) src1 src2 src3))
(rule (x64_vfmsub132 $F32X4 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub132ps) src1 src2 src3))
(rule (x64_vfmsub132 $F64X2 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfmsub132pd) src1 src2 src3))

;; Helper for creating `vfnmsub213*` instructions.
;;
;; Picks the scalar (`ss`/`sd`) or packed (`ps`/`pd`) opcode from the type and
;; forwards the three operands, in order, to `xmm_rmr_vex3`. Only the final
;; operand may be a memory operand (`XmmMem`); the first two must be registers.
(decl x64_vfnmsub213 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfnmsub213 $F32 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub213ss) src1 src2 src3))
(rule (x64_vfnmsub213 $F64 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub213sd) src1 src2 src3))
(rule (x64_vfnmsub213 $F32X4 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub213ps) src1 src2 src3))
(rule (x64_vfnmsub213 $F64X2 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub213pd) src1 src2 src3))

;; Helper for creating `vfnmsub132*` instructions.
;;
;; Picks the scalar (`ss`/`sd`) or packed (`ps`/`pd`) opcode from the type and
;; forwards the three operands, in order, to `xmm_rmr_vex3`. Only the final
;; operand may be a memory operand (`XmmMem`); the first two must be registers.
(decl x64_vfnmsub132 (Type Xmm Xmm XmmMem) Xmm)
(rule (x64_vfnmsub132 $F32 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub132ss) src1 src2 src3))
(rule (x64_vfnmsub132 $F64 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub132sd) src1 src2 src3))
(rule (x64_vfnmsub132 $F32X4 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub132ps) src1 src2 src3))
(rule (x64_vfnmsub132 $F64X2 src1 src2 src3)
      (xmm_rmr_vex3 (AvxOpcode.Vfnmsub132pd) src1 src2 src3))

;; Note, the `vfmsub231*` and `vfnmsub231*` instructions are omitted, because
;; instruction selection happens before register allocation and therefore there
;; is no benefit to a third permutation.

;; Helper for creating `sqrtss` instructions.
;;
;; NB: the square-root operation technically only has one operand but this
Expand Down
18 changes: 17 additions & 1 deletion cranelift/codegen/src/isa/x64/inst/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1565,7 +1565,23 @@ impl AvxOpcode {
| AvxOpcode::Vfnmadd132ss
| AvxOpcode::Vfnmadd132sd
| AvxOpcode::Vfnmadd132ps
| AvxOpcode::Vfnmadd132pd => smallvec![InstructionSet::FMA],
| AvxOpcode::Vfnmadd132pd
| AvxOpcode::Vfmsub213ss
| AvxOpcode::Vfmsub213sd
| AvxOpcode::Vfmsub213ps
| AvxOpcode::Vfmsub213pd
| AvxOpcode::Vfmsub132ss
| AvxOpcode::Vfmsub132sd
| AvxOpcode::Vfmsub132ps
| AvxOpcode::Vfmsub132pd
| AvxOpcode::Vfnmsub213ss
| AvxOpcode::Vfnmsub213sd
| AvxOpcode::Vfnmsub213ps
| AvxOpcode::Vfnmsub213pd
| AvxOpcode::Vfnmsub132ss
| AvxOpcode::Vfnmsub132sd
| AvxOpcode::Vfnmsub132ps
| AvxOpcode::Vfnmsub132pd => smallvec![InstructionSet::FMA],
AvxOpcode::Vminps
| AvxOpcode::Vminpd
| AvxOpcode::Vmaxps
Expand Down
16 changes: 16 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2580,6 +2580,22 @@ pub(crate) fn emit(
AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C),
AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC),
AvxOpcode::Vfmsub132ss => (false, OpcodeMap::_0F38, 0x9B),
AvxOpcode::Vfmsub213ss => (false, OpcodeMap::_0F38, 0xAB),
AvxOpcode::Vfnmsub132ss => (false, OpcodeMap::_0F38, 0x9F),
AvxOpcode::Vfnmsub213ss => (false, OpcodeMap::_0F38, 0xAF),
AvxOpcode::Vfmsub132sd => (true, OpcodeMap::_0F38, 0x9B),
AvxOpcode::Vfmsub213sd => (true, OpcodeMap::_0F38, 0xAB),
AvxOpcode::Vfnmsub132sd => (true, OpcodeMap::_0F38, 0x9F),
AvxOpcode::Vfnmsub213sd => (true, OpcodeMap::_0F38, 0xAF),
AvxOpcode::Vfmsub132ps => (false, OpcodeMap::_0F38, 0x9A),
AvxOpcode::Vfmsub213ps => (false, OpcodeMap::_0F38, 0xAA),
AvxOpcode::Vfnmsub132ps => (false, OpcodeMap::_0F38, 0x9E),
AvxOpcode::Vfnmsub213ps => (false, OpcodeMap::_0F38, 0xAE),
AvxOpcode::Vfmsub132pd => (true, OpcodeMap::_0F38, 0x9A),
AvxOpcode::Vfmsub213pd => (true, OpcodeMap::_0F38, 0xAA),
AvxOpcode::Vfnmsub132pd => (true, OpcodeMap::_0F38, 0x9E),
AvxOpcode::Vfnmsub213pd => (true, OpcodeMap::_0F38, 0xAE),
AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
Expand Down
32 changes: 32 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -2860,6 +2860,38 @@
(rule 3 (fnmadd ty (fneg x) y z) (fmadd ty x y z))
(rule 4 (fnmadd ty x (fneg y) z) (fmadd ty x y z))


;; Match an `fma` whose addend is an `fneg` and, when `use_fma` holds, lower it
;; through `fmsub` on the un-negated `z` (i.e. `x * y - z`).
;; NOTE(review): priority 2 is presumably chosen so this rule wins over the
;; generic `fma` lowering rules — confirm against the other `fma` rules.
(rule 2 (lower (has_type ty (fma x y (fneg z))))
(if-let $true (use_fma))
(fmsub ty x y z))

;; fmsub and fnmsub
;;
;; Lowering helpers for fused multiply-subtract. Each takes the result type and
;; the three operand `Value`s and returns the `Xmm` holding the result.
;; `fmsub ty x y z` is used for `x * y - z`; by the negation rules below,
;; `fnmsub ty x y z` stands for `(-x) * y - z`.
(decl fmsub (Type Value Value Value) Xmm)
(decl fnmsub (Type Value Value Value) Xmm)

;; Base case, will sink a load of `z` automatically.
(rule (fmsub ty x y z) (x64_vfmsub213 ty x y z))

;; Allow sinking loads with one of the two values being multiplied in addition
;; to the value being subtracted. Note that both x and y can be sunk here due to
;; multiplication being commutative.
;; The sinkable load is moved to the last operand of the `132` form, and the
;; subtracted value `z` is moved to the middle operand.
(rule 1 (fmsub ty (sinkable_load x) y z) (x64_vfmsub132 ty y z x))
(rule 2 (fmsub ty x (sinkable_load y) z) (x64_vfmsub132 ty x z y))

;; If one of the values being multiplied is negated then use a `vfnmsub*`
;; instruction instead. These have higher priority than the load-sinking rules
;; above, so a negated multiplicand is handled first.
(rule 3 (fmsub ty (fneg x) y z) (fnmsub ty x y z))
(rule 4 (fmsub ty x (fneg y) z) (fnmsub ty x y z))

;; `fnmsub` follows the same shape as `fmsub`: a base case using the `213`
;; form, then two higher-priority rules that move a sinkable load of either
;; multiplicand into the last operand of the `132` form.
(rule (fnmsub ty x y z) (x64_vfnmsub213 ty x y z))
(rule 1 (fnmsub ty (sinkable_load x) y z) (x64_vfnmsub132 ty y z x))
(rule 2 (fnmsub ty x (sinkable_load y) z) (x64_vfnmsub132 ty x z y))

;; Like `fmsub`, if one argument is negated, switch which one is being
;; codegen'd: the two negations cancel, so hand the un-negated operands back
;; to `fmsub`.
(rule 3 (fnmsub ty (fneg x) y z) (fmsub ty x y z))
(rule 4 (fnmsub ty x (fneg y) z) (fmsub ty x y z))


;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; In order to load a value from memory to a GPR register, we may need to extend
Expand Down
Loading

0 comments on commit 225d20e

Please sign in to comment.