From c48d2118eb9241432a6e55565d56492712697f85 Mon Sep 17 00:00:00 2001 From: Jason Conway Date: Sun, 25 Sep 2022 20:16:07 -0500 Subject: [PATCH] Implement FCMOVcc, cmpss, pminsw, and pmaxsw --- emu/decode.h | 16 ++++++++++++- emu/fpu.c | 17 ++++++++++++- emu/fpu.h | 9 +++++++ emu/vec.c | 31 ++++++++++++++++++++---- emu/vec.h | 4 +++- jit/gen.c | 8 +++++++ tests/e2e/qemu/expected.txt | 48 +++++++++++++++++++++++++++++++++++++ tests/e2e/qemu/qemu-test.c | 22 ++++++++--------- 8 files changed, 136 insertions(+), 19 deletions(-) diff --git a/emu/decode.h b/emu/decode.h index eb4add235b..7edf05f217 100644 --- a/emu/decode.h +++ b/emu/decode.h @@ -426,12 +426,16 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { READMODRM; V_OP(subss_b, xmm_modrm_val, xmm_modrm_reg,128); break; case 0xe9: TRACEI("psubsw xmm:modrm, xmm"); READMODRM; V_OP(subss_w, xmm_modrm_val, xmm_modrm_reg,128); break; + case 0xea: TRACEI("pminsw xmm:modrm, xmm"); + READMODRM; V_OP(mins_w, xmm_modrm_val, xmm_modrm_reg,128); break; case 0xeb: TRACEI("por xmm:modrm, xmm"); READMODRM; V_OP(or, xmm_modrm_val, xmm_modrm_reg,128); break; case 0xec: TRACEI("paddsb xmm:modrm, xmm"); READMODRM; V_OP(addss_b, xmm_modrm_val, xmm_modrm_reg,128); break; case 0xed: TRACEI("paddsw xmm:modrm, xmm"); READMODRM; V_OP(addss_w, xmm_modrm_val, xmm_modrm_reg,128); break; + case 0xee: TRACEI("pmaxsw xmm:modrm, xmm"); + READMODRM; V_OP(maxs_w, xmm_modrm_val, xmm_modrm_reg,128); break; case 0xef: TRACEI("pxor xmm:modrm, xmm"); READMODRM; V_OP(xor, xmm_modrm_val, xmm_modrm_reg,128); break; case 0xf3: TRACEI("psllq xmm:modrm, xmm"); @@ -927,6 +931,14 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { case 0xd87: TRACE("fdivr st(i), st"); FDIVR(st_i, st_0); break; case 0xd90: TRACE("fld st(i)"); FLD(); break; case 0xd91: TRACE("fxch st"); FXCH(); break; + case 0xda0: TRACE("fcmovb st, st(i)"); FCMOVB(st_i); break; + case 0xda1: TRACE("fcmove st, st(i)"); FCMOVE(st_i); break; + case 0xda2: TRACE("fcmovbe st, st(i)"); FCMOVBE(st_i); break; + case 0xda3: TRACE("fcmovu st, st(i)"); FCMOVU(st_i); break; + case 0xdb0: TRACE("fcmovnb st, st(i)"); FCMOVNB(st_i); break; + case 0xdb1: TRACE("fcmovne st, st(i)"); FCMOVNE(st_i); break; + case 0xdb2: TRACE("fcmovnbe st, st(i)"); FCMOVNBE(st_i); break; + case 0xdb3: TRACE("fcmovnu st, st(i)"); FCMOVNU(st_i); break; case 0xdb5: TRACE("fucomi st"); FUCOMI(); break; case 0xdb6: TRACE("fcomi st"); FCOMI(); break; case 0xdc0: TRACE("fadd st, st(i)"); FADD(st_0, st_i); break; @@ -1187,7 +1199,6 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { READMODRM; V_OP(single_fsub, xmm_modrm_val, xmm_modrm_reg,32); break; case 0x5e: TRACEI("divss xmm:modrm, xmm"); READMODRM; V_OP(single_fdiv, xmm_modrm_val, xmm_modrm_reg,32); break; - case 0x6f: TRACEI("movdqu xmm:modrm, xmm"); READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break; @@ -1209,6 +1220,9 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { case 0xbd: TRACEI("~~lzcnt~~ bsr modrm, reg"); READMODRM; BSR(modrm_val, modrm_reg,oz); break; + case 0xc2: TRACEI("cmpss xmm:modrm, xmm, imm8"); + READMODRM; READIMM8; V_OP_IMM(single_fcmp, xmm_modrm_val, xmm_modrm_reg,32); break; + default: TRACE("undefined"); UNDEFINED; } break; diff --git a/emu/fpu.c b/emu/fpu.c index afaaa18a36..a58e88486b 100644 --- a/emu/fpu.c +++ b/emu/fpu.c @@ -64,7 +64,6 @@ void fpu_ldm80(struct cpu_state *cpu, float80 *f) { void fpu_st(struct cpu_state *cpu, int i) { ST(i) = ST(0); } - void fpu_ist16(struct cpu_state *cpu, int16_t *i) { int64_t res = f80_to_int(ST(0)); if (res < INT16_MIN || res > INT16_MAX) @@ -92,6 +91,22 @@ void fpu_stm80(struct cpu_state *cpu, float80 *f) { memcpy(f, &ST(0), 10); } +// moves + +#define FCMOVcc(instr, cond) \ + void fpu_cmov##instr(struct cpu_state *cpu, int i) { \ + if (cond) \ + ST(0) = ST(i); \ + } +FCMOVcc(b, cpu->cf) +FCMOVcc(e, cpu->zf) +FCMOVcc(be, cpu->cf | cpu->zf) +FCMOVcc(u, cpu->pf) +FCMOVcc(nb, !cpu->cf) +FCMOVcc(ne, !cpu->zf) +FCMOVcc(nbe, !(cpu->cf | cpu->zf)) +FCMOVcc(nu, !cpu->pf) + // math void fpu_prem(struct cpu_state *cpu) { diff --git a/emu/fpu.h b/emu/fpu.h index 8b491e22e2..7cdf6349e4 100644 --- a/emu/fpu.h +++ b/emu/fpu.h @@ -39,6 +39,15 @@ void fpu_stm32(struct cpu_state *cpu, float *f); void fpu_stm64(struct cpu_state *cpu, double *f); void fpu_stm80(struct cpu_state *cpu, float80 *f); +void fpu_cmovb(struct cpu_state *cpu, int i); +void fpu_cmove(struct cpu_state *cpu, int i); +void fpu_cmovbe(struct cpu_state *cpu, int i); +void fpu_cmovu(struct cpu_state *cpu, int i); +void fpu_cmovnb(struct cpu_state *cpu, int i); +void fpu_cmovne(struct cpu_state *cpu, int i); +void fpu_cmovnbe(struct cpu_state *cpu, int i); +void fpu_cmovnu(struct cpu_state *cpu, int i); + void fpu_ld(struct cpu_state *cpu, int i); void fpu_ldc(struct cpu_state *cpu, enum fpu_const c); void fpu_ild16(struct cpu_state *cpu, int16_t *i); diff --git a/emu/vec.c b/emu/vec.c index 502450380a..0fb2e30b34 100644 --- a/emu/vec.c +++ b/emu/vec.c @@ -34,8 +34,7 @@ static inline uint32_t satub(uint32_t dw) { dw = 0xff; return dw; } -static inline uint32_t satsb(uint32_t dw) -{ +static inline uint32_t satsb(uint32_t dw) { if (dw > 0xffffff80) dw &= 0xff; else if (dw > 0x7fffffff) @@ -327,12 +326,20 @@ void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) { if (src->u8[i] < dst->u8[i]) dst->u8[i] = src->u8[i]; } - void vec_max_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) { for (unsigned i = 0; i < array_size(src->u8); i++) if (src->u8[i] > dst->u8[i]) dst->u8[i] = src->u8[i]; } +void vec_mins_w128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) { + for (unsigned i = 0; i < 8; i++) + dst->u16[i] = (int16_t)dst->u16[i] < (int16_t)src->u16[i] ? dst->u16[i] : src->u16[i]; +} + +void vec_maxs_w128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) { + for (unsigned i = 0; i < 8; i++) + dst->u16[i] = (int16_t)dst->u16[i] > (int16_t)src->u16[i] ? dst->u16[i] : src->u16[i]; +} static bool cmpd(double a, double b, int type) { bool res; @@ -345,10 +352,24 @@ static bool cmpd(double a, double b, int type) { if (type >= 4) res = !res; return res; } +static bool cmps(float a, float b, int type) { + bool res; + switch (type % 4) { + case 0: res = a == b; break; + case 1: res = a < b; break; + case 2: res = a <= b; break; + case 3: res = isnan(a) || isnan(b); break; + } + if (type >= 4) res = !res; + return res; +} void vec_single_fcmp64(NO_CPU, const double *src, union xmm_reg *dst, uint8_t type) { dst->qw[0] = cmpd(dst->f64[0], *src, type) ? -1 : 0; } +void vec_single_fcmp32(NO_CPU, const float *src, union xmm_reg *dst, uint8_t type) { + dst->u32[0] = cmps(dst->f32[0], *src, type) ? -1 : 0; +} void vec_single_fadd64(NO_CPU, const double *src, double *dst) { *dst += *src; } void vec_single_fadd32(NO_CPU, const float *src, float *dst) { *dst += *src; } @@ -606,11 +627,11 @@ void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t i } void vec_avg_b128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) { - for(unsigned i = 0; i < 16; i++) + for (unsigned i = 0; i < 16; i++) dst->u8[i] = (1 + dst->u8[i] + src->u8[i]) >> 1; } void vec_avg_w128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) { - for(unsigned i = 0; i < 8; i++) + for (unsigned i = 0; i < 8; i++) dst->u16[i] = (1 + dst->u16[i] + src->u16[i]) >> 1; } diff --git a/emu/vec.h b/emu/vec.h index 83f12fda41..51c3e5ff60 100644 --- a/emu/vec.h +++ b/emu/vec.h @@ -74,7 +74,9 @@ void vec_xor128(NO_CPU, union xmm_reg *src, union xmm_reg *dst); void vec_xor64(NO_CPU, union mm_reg *src, union mm_reg *dst); void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst); +void vec_mins_w128(NO_CPU, union xmm_reg *src, union xmm_reg *dst); void vec_max_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst); +void vec_maxs_w128(NO_CPU, union xmm_reg *src, union xmm_reg *dst); void vec_single_fadd64(NO_CPU, const double *src, double *dst); void vec_single_fadd32(NO_CPU, const float *src, float *dst); @@ -91,7 +93,7 @@ void vec_single_fmin64(NO_CPU, const double *src, double *dst); void vec_single_ucomi32(struct cpu_state *cpu, const float *src, const float *dst); void vec_single_ucomi64(struct cpu_state *cpu, const double *src, const double *dst); void vec_single_fcmp64(NO_CPU, const double *src, union xmm_reg *dst, uint8_t type); - +void vec_single_fcmp32(NO_CPU, const float *src, union xmm_reg *dst, uint8_t type); void vec_fcmp_p64(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t type); void vec_cvtsi2sd32(NO_CPU, const int32_t *src, double *dst); diff --git a/jit/gen.c b/jit/gen.c index 577de3a015..5d33a2ad22 100644 --- a/jit/gen.c +++ b/jit/gen.c @@ -437,6 +437,14 @@ void helper_rdtsc(struct cpu_state *cpu); #define FSIN() h(fpu_sin) #define FCOS() h(fpu_cos) #define FXTRACT() h(fpu_xtract) +#define FCMOVB(src) hh(fpu_cmovb, src) +#define FCMOVE(src) hh(fpu_cmove, src) +#define FCMOVBE(src) hh(fpu_cmovbe, src) +#define FCMOVU(src) hh(fpu_cmovu, src) +#define FCMOVNB(src) hh(fpu_cmovnb, src) +#define FCMOVNE(src) hh(fpu_cmovne, src) +#define FCMOVNBE(src) hh(fpu_cmovnbe, src) +#define FCMOVNU(src) hh(fpu_cmovnu, src) // vector diff --git a/tests/e2e/qemu/expected.txt b/tests/e2e/qemu/expected.txt index 80df696ece..340d49b1a2 100644 --- a/tests/e2e/qemu/expected.txt +++ b/tests/e2e/qemu/expected.txt @@ -4206,6 +4206,34 @@ fldpi= 3.141593 fldlg2= 0.301030 fldln2= 0.693147 fldz= 0.000000 +fcmovb eflags=0x0000-> 1.000000 +fcmove eflags=0x0000-> 1.000000 +fcmovbe eflags=0x0000-> 1.000000 +fcmovnb eflags=0x0000-> 2.000000 +fcmovne eflags=0x0000-> 2.000000 +fcmovnbe eflags=0x0000-> 2.000000 +fcmovb eflags=0x0001-> 2.000000 +fcmove eflags=0x0001-> 1.000000 +fcmovbe eflags=0x0001-> 2.000000 +fcmovnb eflags=0x0001-> 1.000000 +fcmovne eflags=0x0001-> 2.000000 +fcmovnbe eflags=0x0001-> 1.000000 +fcmovb eflags=0x0040-> 1.000000 +fcmove eflags=0x0040-> 2.000000 +fcmovbe eflags=0x0040-> 2.000000 +fcmovnb eflags=0x0040-> 2.000000 +fcmovne eflags=0x0040-> 1.000000 +fcmovnbe eflags=0x0040-> 1.000000 +fcmovb eflags=0x0041-> 2.000000 +fcmove eflags=0x0041-> 2.000000 +fcmovbe eflags=0x0041-> 2.000000 +fcmovnb eflags=0x0041-> 1.000000 +fcmovne eflags=0x0041-> 1.000000 +fcmovnbe eflags=0x0041-> 1.000000 +fcmovu eflags=0x0000-> 1.000000 +fcmovu eflags=0x0004-> 2.000000 +fcmovnu eflags=0x0000-> 2.000000 +fcmovnu eflags=0x0004-> 1.000000 xchgl A=fbca7654 B=12345678 xchgw A=12347654 B=fbca5678 xchgb A=12345654 B=fbca7678 @@ -4418,12 +4446,16 @@ psubsb : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab psubsb : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=61e80000091e00f3f1063d8000000000 psubsw : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=9a5f3e04ea6777a6263ea6f9800070c8 psubsw : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=60e80000091efff3f1063d6800000000 +pminsw : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=dc511efb944ae1461f2923c69869d7ab +pminsw : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=c233e9e8c4c9438d007c255a085427f8 por : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=ddf35effbdebf9ee5f6f7fcfd8fbdffb por : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=e33be9e8cdef439f0f7e67da085427f8 paddsb : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=1d437afa802d3932647f7f93f0231f1e paddsb : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=e54ed2d091b07f800f7f7f1c107f4ef0 paddsw : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=1e437bfa80003a3264907ffff123201e paddsw : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=e54ed3d092b07fff0ff27fff10a84ff0 +pmaxsw : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=41f25cffa9e358ec45677ccd58ba4873 +pmaxsw : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=231be9e8cde7439a0f7662c2085427f8 pxor : a=456723c698694873 b=1f297ccd58bad7ab r=5a4e5f0bc0d39fd8 pxor : a=007c62c2085427f8 b=0f76255a085427f8 r=0f0a479800000000 pxor : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=9da342043da9b9aa5a4e5f0bc0d39fd8 @@ -4581,6 +4613,14 @@ mulss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd subps : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c27a666600000000c3af0000c22c0000 subss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999ac22c0000 divss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999a3d71fee1 +cmpeqss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 +cmpltss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999affffffff +cmpless : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999affffffff +cmpunordss: a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 +cmpneqss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999affffffff +cmpnltss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 +cmpnless : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 +cmpordss : a=c0c9999a408000004059999a402ccccd b=426133334080000043b0b3334236cccd r=c0c9999a408000004059999affffffff addpd : a=c00b333333333333400599999999999a b=c04ab333333333334046d9999999999a r=c04c6666666666664048333333333334 addsd : a=c00b333333333333400599999999999a b=c04ab333333333334046d9999999999a r=c00b3333333333334048333333333334 mulpd : a=c00b333333333333400599999999999a b=c04ab333333333334046d9999999999a r=4066b1eb851eb852405ed8f5c28f5c2a @@ -4614,6 +4654,14 @@ mulss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd subps : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=ffc0000000000000c3af0000ffc00000 subss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999affc00000 divss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999affc00000 +cmpeqss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 +cmpltss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 +cmpless : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 +cmpunordss: a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999affffffff +cmpneqss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999affffffff +cmpnltss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999affffffff +cmpnless : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999affffffff +cmpordss : a=c0c9999a408000004059999affc00000 b=ffc000004080000043b0b3334236cccd r=c0c9999a408000004059999a00000000 addpd : a=c00b333333333333fff8000000000000 b=fff80000000000004046d9999999999a r=fff8000000000000fff8000000000000 addsd : a=c00b333333333333fff8000000000000 b=fff80000000000004046d9999999999a r=c00b333333333333fff8000000000000 mulpd : a=c00b333333333333fff8000000000000 b=fff80000000000004046d9999999999a r=fff8000000000000fff8000000000000 diff --git a/tests/e2e/qemu/qemu-test.c b/tests/e2e/qemu/qemu-test.c index 8cfd87f009..76e4393e5b 100644 --- a/tests/e2e/qemu/qemu-test.c +++ b/tests/e2e/qemu/qemu-test.c @@ -1035,7 +1035,7 @@ void test_floats(void) //test_fbcd(-123451234567890.0); //test_fenv(); if (TEST_CMOV) { - //test_fcmov(); + test_fcmov(); } } @@ -2466,11 +2466,11 @@ void test_sse(void) SSE_OP2(psubsb); SSE_OP2(psubsw); - // MMX_OP2(pminsw); + SSE_OP2(pminsw); SSE_OP2(por); SSE_OP2(paddsb); SSE_OP2(paddsw); - // MMX_OP2(pmaxsw); + SSE_OP2(pmaxsw); MMX_OP2(pxor); MMX_OP2(pmuludq); SSE_OP2(pmaddwd); @@ -2625,14 +2625,14 @@ void test_sse(void) SSE_OPS_S(div); // SSE_OPS(max); // SSE_OPS(sqrt); - // SSE_OPS(cmpeq); - // SSE_OPS(cmplt); - // SSE_OPS(cmple); - // SSE_OPS(cmpunord); - // SSE_OPS(cmpneq); - // SSE_OPS(cmpnlt); - // SSE_OPS(cmpnle); - // SSE_OPS(cmpord); + SSE_OPS_S(cmpeq); + SSE_OPS_S(cmplt); + SSE_OPS_S(cmple); + SSE_OPS_S(cmpunord); + SSE_OPS_S(cmpneq); + SSE_OPS_S(cmpnlt); + SSE_OPS_S(cmpnle); + SSE_OPS_S(cmpord); a.d[0] = 2.7;