Skip to content

Commit

Permalink
Add a crapton of SSE instructions
Browse files Browse the repository at this point in the history
mfence, ucomisd, comisd, movmskpd, andpd, punpckldq, psubq, comiss,
andps, andnps, maxsd, cmpsd. Enough for a node repl.

ish-app#90
  • Loading branch information
tbodt committed Jun 9, 2020
1 parent 8d84030 commit 55816c8
Show file tree
Hide file tree
Showing 5 changed files with 255 additions and 122 deletions.
36 changes: 31 additions & 5 deletions emu/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,6 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
case 0x29: TRACEI("movaps xmm, xmm:modrm");
READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;

case 0x2e: TRACEI("ucomiss xmm, xmm:modrm");
READMODRM; VCOMPARE(xmm_modrm_val, xmm_modrm_reg,32);
break;

case 0x31: TRACEI("rdtsc");
RDTSC; break;

Expand Down Expand Up @@ -190,6 +186,8 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
case 0xad: TRACEI("shrd cl, reg, modrm");
READMODRM; SHRD(reg_c, modrm_reg, modrm_val,oz); break;

case 0xae: TRACEI("fence"); READMODRM; break;

case 0xaf: TRACEI("imul modrm, reg");
READMODRM; IMUL2(modrm_val, modrm_reg,oz); break;

Expand Down Expand Up @@ -264,10 +262,22 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
#endif

#if OP_SIZE == 16
case 0x2e: TRACEI("ucomisd xmm, xmm:modrm");
READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,64); break;
case 0x2f: TRACEI("comisd xmm, xmm:modrm");
READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,64); break;

case 0x50: TRACEI("movmskpd xmm:modrm, reg");
READMODRM; V_OP(fmovmask_d, xmm_modrm_val, modrm_reg,128); break;

case 0x54: TRACEI("andpd xmm:modrm, xmm");
READMODRM; V_OP(and, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x56: TRACEI("orpd xmm:modrm, xmm");
READMODRM; V_OP(or, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x60: TRACEI("punpcklbw xmm:modrm, xmm");
READMODRM; V_OP(unpack_bw, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x62: TRACEI("punpckldq xmm:modrm, xmm");
READMODRM; V_OP(unpack_dq, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x6c: TRACEI("punpcklqdq xmm:modrm, xmm");
READMODRM; V_OP(unpack_qdq, xmm_modrm_val, xmm_modrm_reg,128); break;

Expand Down Expand Up @@ -326,7 +336,9 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
case 0xef: TRACEI("pxor xmm:modrm, xmm");
READMODRM; V_OP(xor, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0xf3: TRACEI("psllq xmm:modrm, xmm");
READMODRM; V_OP(shiftl_q, xmm_modrm_val, xmm_modrm_reg, 128); break;
READMODRM; V_OP(shiftl_q, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0xfb: TRACEI("psubq xmm:modrm, xmm");
READMODRM; V_OP(sub_q, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0xfc: TRACEI("paddb xmm:modrm, xmm");
READMODRM; V_OP(add_b, xmm_modrm_val, xmm_modrm_reg,128); break;
#else
Expand All @@ -335,6 +347,15 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
case 0x11: TRACEI("movups xmm, xmm:modrm");
READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;

case 0x2e: TRACEI("ucomiss xmm, xmm:modrm");
READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,32); break;
case 0x2f: TRACEI("comiss xmm, xmm:modrm");
READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,32); break;

case 0x54: TRACEI("andps xmm:modrm, xmm");
READMODRM; V_OP(and, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x55: TRACEI("andnps xmm:modrm, xmm");
READMODRM; V_OP(andn, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x56: TRACEI("orps xmm:modrm, xmm");
READMODRM; V_OP(or, xmm_modrm_val, xmm_modrm_reg,128); break;
case 0x57: TRACEI("xorps xmm:modrm, xmm");
Expand Down Expand Up @@ -940,10 +961,15 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
READMODRM; V_OP(single_fsub, xmm_modrm_val, xmm_modrm_reg,64); break;
case 0x5e: TRACEI("divsd xmm:modrm, xmm");
READMODRM; V_OP(single_fdiv, xmm_modrm_val, xmm_modrm_reg,64); break;
case 0x5f: TRACEI("maxsd xmm:modrm, xmm");
READMODRM; V_OP(single_fmax, xmm_modrm_val, xmm_modrm_reg,64); break;

case 0x70: TRACEI("pshuflw xmm:modrm, xmm, imm8");
READMODRM; READIMM8; V_OP_IMM(shuffle_lw, xmm_modrm_val, xmm_modrm_reg,128); break;

case 0xc2: TRACEI("cmpsd xmm:modrm, xmm, imm8");
READMODRM; READIMM8; V_OP_IMM(single_fcmp, xmm_modrm_val, xmm_modrm_reg,64); break;

case 0x18 ... 0x1f: TRACEI("rep nop modrm\t"); READMODRM; break;
default: TRACE("undefined"); UNDEFINED;
}
Expand Down
116 changes: 76 additions & 40 deletions emu/vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,6 @@
#include "emu/vec.h"
#include "emu/cpu.h"

// Compare two scalar floats and report the outcome in the zf/pf/cf flags:
//   unordered (either operand NaN): zf = pf = cf = 1
//   *f1 >  *f2:                     zf = pf = cf = 0
//   *f1 <  *f2:                     cf = 1, zf = pf = 0
//   *f1 == *f2:                     zf = 1, pf = cf = 0
// zf_res and pf_res are always cleared afterwards.
void vec_compare32(struct cpu_state *cpu, float *f2, float *f1) {
    const float lhs = *f1;
    const float rhs = *f2;

    if (isnan(lhs) || isnan(rhs)) {
        cpu->zf = 1;
        cpu->pf = 1;
        cpu->cf = 1;
    } else if (lhs > rhs) {
        cpu->zf = 0;
        cpu->pf = 0;
        cpu->cf = 0;
    } else if (lhs < rhs) {
        cpu->zf = 0;
        cpu->pf = 0;
        cpu->cf = 1;
    } else if (lhs == rhs) {
        cpu->zf = 1;
        cpu->pf = 0;
        cpu->cf = 0;
    } else {
        // Not expected to be reachable: the four cases above cover every
        // ordering of two floats.
        printf("something's horribly wrong. err 1093281094");
    }

    cpu->zf_res = 0;
    cpu->pf_res = 0;
}

static inline void zero_xmm(union xmm_reg *xmm) {
xmm->qw[0] = 0;
xmm->qw[1] = 0;
Expand Down Expand Up @@ -98,6 +70,19 @@ void vec_shiftr_q128(NO_CPU, union xmm_reg *amount, union xmm_reg *dst) {
}
}

// Byte-wise add: every u8 lane of src is added into the matching lane of dst.
void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
    unsigned lane = 0;
    while (lane < array_size(src->u8)) {
        dst->u8[lane] += src->u8[lane];
        lane++;
    }
}
// Quadword-wise add: both qw lanes of src are added into dst.
void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
    for (int half = 0; half < 2; half++) {
        dst->qw[half] += src->qw[half];
    }
}
// Quadword-wise subtract: both qw lanes of src are subtracted from dst.
void vec_sub_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
    for (int half = 0; half < 2; half++) {
        dst->qw[half] -= src->qw[half];
    }
}

void vec_and128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
dst->qw[0] &= src->qw[0];
dst->qw[1] &= src->qw[1];
Expand All @@ -110,14 +95,9 @@ void vec_xor128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
dst->qw[0] ^= src->qw[0];
dst->qw[1] ^= src->qw[1];
}

// Byte-wise add: every u8 lane of src is added into the matching lane of dst.
void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
    unsigned lane = 0;
    while (lane < array_size(src->u8)) {
        dst->u8[lane] += src->u8[lane];
        lane++;
    }
}
void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
dst->qw[0] += src->qw[0];
dst->qw[1] += src->qw[1];
// AND-NOT: dst = (~dst) & src, applied to both qw lanes.
void vec_andn128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
    for (int half = 0; half < 2; half++) {
        dst->qw[half] = src->qw[half] & ~dst->qw[half];
    }
}

void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
Expand All @@ -126,6 +106,22 @@ void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
dst->u8[i] = src->u8[i];
}

// Evaluate an SSE compare predicate on two doubles.
//
// Only bits 2:0 of `type` select the predicate; higher bits are ignored so
// that an immediate with reserved bits set does not pick the wrong predicate.
//   bits 1:0  0: a == b   1: a < b   2: a <= b   3: unordered (either is NaN)
//   bit  2    when set, the result of the base predicate is negated
//             (4: !=, 5: !<, 6: !<=, 7: ordered)
// Returns true when the selected predicate holds.
static bool cmpd(double a, double b, int type) {
    const int predicate = type & 7;
    bool res = false;
    switch (predicate & 3) {
        case 0: res = a == b; break;
        case 1: res = a < b; break;
        case 2: res = a <= b; break;
        case 3: res = isnan(a) || isnan(b); break;
    }
    if (predicate & 4)
        res = !res;
    return res;
}

// Scalar double compare: evaluates predicate `type` on (dst->f64[0], *src) and
// replaces dst->qw[0] with all ones when it holds, or zero when it does not.
void vec_single_fcmp64(NO_CPU, const double *src, union xmm_reg *dst, uint8_t type) {
    if (cmpd(dst->f64[0], *src, type)) {
        dst->qw[0] = -1;
    } else {
        dst->qw[0] = 0;
    }
}

// Scalar double arithmetic: *dst = *dst <op> *src.
void vec_single_fadd64(NO_CPU, const double *src, double *dst) {
    *dst = *dst + *src;
}
void vec_single_fmul64(NO_CPU, const double *src, double *dst) {
    *dst = *dst * *src;
}
void vec_single_fsub64(NO_CPU, const double *src, double *dst) {
    *dst = *dst - *src;
}
Expand All @@ -135,15 +131,42 @@ void vec_single_fmul32(NO_CPU, const float *src, float *dst) { *dst *= *src; }
// Scalar float arithmetic: *dst = *dst <op> *src.
void vec_single_fsub32(NO_CPU, const float *src, float *dst) {
    *dst = *dst - *src;
}
void vec_single_fdiv32(NO_CPU, const float *src, float *dst) {
    *dst = *dst / *src;
}

// Scalar double max with MAXSD semantics: *dst = (*dst > *src) ? *dst : *src.
//
// dst is kept only when it is strictly greater than src. In every other case
// the source operand is the result: src greater, either operand NaN (the
// comparison is false), or the operands comparing equal. The equal case
// matters for +0.0 vs -0.0, where the source operand must be returned.
void vec_single_fmax64(NO_CPU, const double *src, double *dst) {
    if (!(*dst > *src))
        *dst = *src;
}

// Scalar float compare that reports its result in the cpu flag fields:
//   unordered (either operand NaN): zf = cf = pf = 1
//   otherwise: zf = (*src == *dst), cf = (*src > *dst), pf = 0
// of, sf and af are cleared, as are the zf_res/pf_res/sf_res fields.
void vec_single_ucomi32(struct cpu_state *cpu, const float *src, const float *dst) {
    cpu->zf_res = cpu->pf_res = 0;
    if (isnan(*src) || isnan(*dst)) {
        cpu->zf = cpu->cf = cpu->pf = 1;
    } else {
        cpu->zf = *src == *dst;
        cpu->cf = *src > *dst;
        cpu->pf = 0;
    }
    cpu->of = cpu->sf = cpu->af = 0;
    cpu->sf_res = 0;
}

// Scalar double compare that reports its result in the cpu flag fields:
//   unordered (either operand NaN): zf = cf = pf = 1
//   otherwise: zf = (*src == *dst), cf = (*src > *dst), pf = 0
// of, sf and af are cleared, as are the zf_res/pf_res/sf_res fields.
void vec_single_ucomi64(struct cpu_state *cpu, const double *src, const double *dst) {
    cpu->zf_res = cpu->pf_res = 0;
    if (isnan(*src) || isnan(*dst)) {
        cpu->zf = cpu->cf = cpu->pf = 1;
    } else {
        cpu->zf = *src == *dst;
        cpu->cf = *src > *dst;
        cpu->pf = 0;
    }
    cpu->of = cpu->sf = cpu->af = 0;
    cpu->sf_res = 0;
}

// VEC_CVT(name, src_t, dst_t) defines
//     void vec_cvt<name>(NO_CPU, const src_t *src, dst_t *dst)
// which stores *src into *dst using C's implicit conversion from src_t to
// dst_t. For floating-point to integer conversions this truncates toward
// zero, and the C result is undefined when the value does not fit in dst_t.
// TODO float edge cases e.g. nan
// NOTE(review): si2sd32, tsd2si64, si2ss32 and tss2si32 are each instantiated
// twice below, once with uint32_t and once with int32_t. This looks like
// before/after residue of a signedness change - confirm which set is intended,
// since both definitions cannot coexist in one translation unit.
#define VEC_CVT(name, src_t, dst_t) \
void vec_cvt##name(NO_CPU, const src_t *src, dst_t *dst) { \
*dst = *src; \
}
VEC_CVT(si2sd32, uint32_t, double)
VEC_CVT(tsd2si64, double, uint32_t)
VEC_CVT(si2sd32, int32_t, double)
VEC_CVT(tsd2si64, double, int32_t)
VEC_CVT(sd2ss64, double, float)
VEC_CVT(si2ss32, uint32_t, float)
VEC_CVT(tss2si32, float, uint32_t)
VEC_CVT(si2ss32, int32_t, float)
VEC_CVT(tss2si32, float, int32_t)
VEC_CVT(ss2sd32, float, double)

void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
Expand All @@ -152,6 +175,11 @@ void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
dst->u8[i*2] = dst->u8[i];
}
}
// Interleave the two low u32 lanes of dst and src. Afterwards
//     dst->u32 = { dst[0], src[0], dst[1], src[1] }   (old values on the right)
// dst->u32[0] is left untouched.
void vec_unpack_dq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
// Lanes are written from the top down so that u32[1] is read (both as
// src->u32[1] when src and dst are the same register, and as dst->u32[1])
// before it is overwritten by the last statement.
dst->u32[3] = src->u32[1];
dst->u32[2] = dst->u32[1];
dst->u32[1] = src->u32[0];
}
// Copy qw lane 0 of src into qw lane 1 of dst; dst's qw lane 0 is left
// unchanged.
void vec_unpack_qdq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
dst->qw[1] = src->qw[0];
}
Expand Down Expand Up @@ -185,6 +213,14 @@ void vec_movmask_b128(NO_CPU, const union xmm_reg *src, uint32_t *dst) {
}
}

// Build a bitmask from the sign bits of the f64 lanes of src: bit i of *dst
// is set when lane i has its sign bit set; all other bits of *dst are zero.
void vec_fmovmask_d128(NO_CPU, const union xmm_reg *src, uint32_t *dst) {
    *dst = 0;
    unsigned lane = 0;
    while (lane < array_size(src->f64)) {
        if (signbit(src->f64[lane])) {
            *dst |= 1 << lane;
        }
        lane++;
    }
}

// Store the u16 lane of src selected by `index` into *dst.
// The index wraps modulo 8.
void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index) {
    const unsigned lane = index & 7;
    *dst = src->u16[lane];
}
23 changes: 16 additions & 7 deletions emu/vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include "emu/cpu.h"

#define NO_CPU struct cpu_state *UNUSED(cpu)
void vec_compare32(NO_CPU, float *f2, float *f1);

// arguments are in src, dst order

Expand All @@ -22,14 +21,17 @@ void vec_imm_shiftl_q128(NO_CPU, const uint8_t amount, union xmm_reg *dst);
void vec_imm_shiftr_q128(NO_CPU, const uint8_t amount, union xmm_reg *dst);
void vec_shiftl_q128(NO_CPU, union xmm_reg *amount, union xmm_reg *dst);
void vec_shiftr_q128(NO_CPU, union xmm_reg *amount, union xmm_reg *dst);
void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_sub_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_and128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_andn128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_or128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_xor128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);

void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);


void vec_single_fadd64(NO_CPU, const double *src, double *dst);
void vec_single_fmul64(NO_CPU, const double *src, double *dst);
void vec_single_fsub64(NO_CPU, const double *src, double *dst);
Expand All @@ -39,21 +41,28 @@ void vec_single_fmul32(NO_CPU, const float *src, float *dst);
void vec_single_fsub32(NO_CPU, const float *src, float *dst);
void vec_single_fdiv32(NO_CPU, const float *src, float *dst);

void vec_cvtsi2sd32(NO_CPU, const uint32_t *src, double *dst);
void vec_cvttsd2si64(NO_CPU, const double *src, uint32_t *dst);
// Scalar double max; the result is stored through dst.
void vec_single_fmax64(NO_CPU, const double *src, double *dst);
// Scalar compares that report their outcome through the flag fields of cpu,
// which is why they take a named cpu pointer rather than NO_CPU.
void vec_single_ucomi32(struct cpu_state *cpu, const float *src, const float *dst);
void vec_single_ucomi64(struct cpu_state *cpu, const double *src, const double *dst);
// Scalar double compare selected by `type`; the result mask is written into dst.
void vec_single_fcmp64(NO_CPU, const double *src, union xmm_reg *dst, uint8_t type);

void vec_cvtsi2sd32(NO_CPU, const int32_t *src, double *dst);
void vec_cvttsd2si64(NO_CPU, const double *src, int32_t *dst);
void vec_cvtsd2ss64(NO_CPU, const double *src, float *dst);
void vec_cvtsi2ss32(NO_CPU, const uint32_t *src, float *dst);
void vec_cvttss2si32(NO_CPU, const float *src, uint32_t *dst);
void vec_cvtsi2ss32(NO_CPU, const int32_t *src, float *dst);
void vec_cvttss2si32(NO_CPU, const float *src, int32_t *dst);
void vec_cvtss2sd32(NO_CPU, const float *src, double *dst);

// TODO organize
void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
void vec_unpack_dq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
void vec_unpack_qdq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
void vec_shuffle_lw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding);
void vec_shuffle_d128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding);
void vec_compare_eqb128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
void vec_compare_eqd128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
void vec_movmask_b128(NO_CPU, const union xmm_reg *src, uint32_t *dst);
void vec_fmovmask_d128(NO_CPU, const union xmm_reg *src, uint32_t *dst);
void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index);

#endif
Loading

0 comments on commit 55816c8

Please sign in to comment.