Implement some floating point sse1/sse2 instructions (#57)

This commit is contained in:
Fabian 2018-07-27 16:24:54 -06:00
parent 6f28d8b9c9
commit 7e574dde52
8 changed files with 324 additions and 38 deletions

View file

@ -494,7 +494,10 @@ const encodings = [
{ sse: 1, opcode: 0x660F28, e: 1 },
{ sse: 1, opcode: 0x0F29, e: 1 },
{ sse: 1, opcode: 0x660F29, e: 1 },
{ sse: 1, opcode: 0x0F2A, skip: 1 },
{ sse: 1, opcode: 0x0F2A, e: 1, },
{ sse: 1, opcode: 0x660F2A, e: 1, },
{ sse: 1, opcode: 0xF20F2A, e: 1, },
{ sse: 1, opcode: 0xF30F2A, e: 1, },
{ sse: 1, opcode: 0x0F2B, only_mem: 1, e: 1 },
{ sse: 1, opcode: 0x660F2B, only_mem: 1, e: 1 },
@ -510,7 +513,11 @@ const encodings = [
{ sse: 1, opcode: 0x660F50, only_reg: 1, e: 1 },
{ sse: 1, opcode: 0x0F51, skip: 1 },
{ sse: 1, opcode: 0x0F52, skip: 1 },
{ sse: 1, opcode: 0x0F53, skip: 1 },
// reciprocal: approximation of 1/x. Skipped because our approximation doesn't match Intel's
{ sse: 1, opcode: 0x0F53, e: 1, skip: 1, },
{ sse: 1, opcode: 0xF30F53, e: 1, skip: 1, },
{ sse: 1, opcode: 0x0F54, e: 1 },
{ sse: 1, opcode: 0x660F54, e: 1 },
{ sse: 1, opcode: 0x0F55, e: 1 },
@ -520,14 +527,26 @@ const encodings = [
{ sse: 1, opcode: 0x0F57, e: 1 },
{ sse: 1, opcode: 0x660F57, e: 1 },
{ sse: 1, opcode: 0x0F58, skip: 1 },
{ sse: 1, opcode: 0x0F59, skip: 1 },
{ sse: 1, opcode: 0x0F58, e: 1, },
{ sse: 1, opcode: 0x660F58, e: 1, },
{ sse: 1, opcode: 0xF20F58, e: 1, },
{ sse: 1, opcode: 0xF30F58, e: 1, },
{ sse: 1, opcode: 0x0F59, e: 1, },
{ sse: 1, opcode: 0x660F59, e: 1, },
{ sse: 1, opcode: 0xF20F59, e: 1, },
{ sse: 1, opcode: 0xF30F59, e: 1, },
{ sse: 1, opcode: 0x0F5A, skip: 1 },
{ sse: 1, opcode: 0x0F5B, skip: 1 },
{ sse: 1, opcode: 0x0F5C, skip: 1 },
{ sse: 1, opcode: 0x0F5D, skip: 1 },
{ sse: 1, opcode: 0x0F5E, skip: 1 },
{ sse: 1, opcode: 0x0F5F, skip: 1 },
{ sse: 1, opcode: 0x0F5C, e: 1, },
{ sse: 1, opcode: 0x660F5C, e: 1, },
{ sse: 1, opcode: 0xF20F5C, e: 1, },
{ sse: 1, opcode: 0xF30F5C, e: 1, },
{ sse: 1, opcode: 0x0F5D, skip: 1, },
{ sse: 1, opcode: 0x0F5E, skip: 1, },
{ sse: 1, opcode: 0x0F5F, skip: 1, },
{ sse: 1, opcode: 0x660F60, e: 1 },
{ sse: 1, opcode: 0x0F60, e: 1 },
@ -614,7 +633,10 @@ const encodings = [
{ sse: 1, opcode: 0x660F7F, e: 1 },
{ sse: 1, opcode: 0xF30F7F, e: 1 },
{ sse: 1, opcode: 0x0FC2, skip: 1, },
{ sse: 1, opcode: 0x0FC2, e: 1, imm8: 1 },
{ sse: 1, opcode: 0x660FC2, e: 1, imm8: 1 },
{ sse: 1, opcode: 0xF20FC2, e: 1, imm8: 1 },
{ sse: 1, opcode: 0xF30FC2, e: 1, imm8: 1 },
{ opcode: 0x0FC3, e: 1, only_mem: 1, }, // movnti: Uses normal registers, hence not marked as sse

View file

@ -1368,6 +1368,16 @@ void write_mmx_reg64(int32_t r, union reg64 data)
reg_mmx[r].u64[0] = data.u64[0];
}
float_t read_xmm_f32(int32_t r)
{
    // Fetch the lowest single-precision lane (f32[0]) of xmm register r.
    const float_t lowest_lane = reg_xmm[r].f32[0];
    return lowest_lane;
}
int32_t read_xmm32(int32_t r)
{
    // Returns the lowest 32-bit lane of xmm register r as a signed integer.
    // The value is read through the signed i32 view (the same view that
    // write_xmm32 stores to), so no implementation-defined unsigned -> signed
    // conversion takes place on return.
    return reg_xmm[r].i32[0];
}
union reg64 read_xmm64s(int32_t r)
{
union reg64 x;
@ -1380,6 +1390,16 @@ union reg128 read_xmm128s(int32_t r)
return reg_xmm[r];
}
void write_xmm_f32(int32_t r, float_t data)
{
    // Overwrite only the lowest single-precision lane (f32[0]) of xmm
    // register r; the other lanes are not written here.
    float_t *lowest_lane = &reg_xmm[r].f32[0];
    *lowest_lane = data;
}
void write_xmm32(int32_t r, int32_t data)
{
    // Overwrite only the lowest 32-bit integer lane (i32[0]) of xmm
    // register r; the other lanes are not written here.
    int32_t *lowest_lane = &reg_xmm[r].i32[0];
    *lowest_lane = data;
}
void write_xmm64(int32_t r, union reg64 data)
{
reg_xmm[r].u64[0] = data.u64[0];

View file

@ -1,6 +1,7 @@
#pragma once
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
@ -17,6 +18,8 @@ union reg128 {
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
float_t f32[4];
double_t f64[2];
};
_Static_assert(sizeof(union reg128) == 16, "reg128 is 16 bytes");
@ -29,6 +32,7 @@ union reg64 {
uint16_t u16[4];
uint32_t u32[2];
uint64_t u64[1];
float_t f32[2];
double f64[1];
};
_Static_assert(sizeof(union reg64) == 8, "reg64 is 8 bytes");
@ -141,8 +145,12 @@ int32_t read_mmx32s(int32_t r);
union reg64 read_mmx64s(int32_t r);
void write_mmx64(int32_t r, int32_t low, int32_t high);
void write_mmx_reg64(int32_t r, union reg64 data);
// Accessors for the xmm register file; r is the register index.
// The f32/32 variants operate on the lowest 32-bit lane only.
float_t read_xmm_f32(int32_t r);
int32_t read_xmm32(int32_t r);
union reg64 read_xmm64s(int32_t r);
union reg128 read_xmm128s(int32_t r);
void write_xmm_f32(int32_t r, float_t data);
void write_xmm32(int32_t r, int32_t data);
void write_xmm64(int32_t r, union reg64 data);
void write_xmm128(int32_t r, int32_t i0, int32_t i1, int32_t i2, int32_t i3);
void write_xmm_reg128(int32_t r, union reg128 data);

View file

@ -621,7 +621,48 @@ void instr_660F29_reg(int32_t r1, int32_t r2) {
mov_r_r128(r1, r2);
}
void instr_0F2A() { unimplemented_sse(); }
void instr_0F2A(union reg64 source, int32_t r) {
// cvtpi2ps xmm, mm/m64
// XXX: The non-memory variant causes a transition from x87 FPU to MMX technology operation
union reg64 result = {
.f32 = {
// Note: Casts here can fail
source.i32[0],
source.i32[1],
}
};
write_xmm64(r, result);
}
DEFINE_SSE_SPLIT(instr_0F2A, safe_read64s, read_mmx64s)
void instr_660F2A(union reg64 source, int32_t r) {
// cvtpi2pd xmm, xmm/m64
// XXX: The non-memory variant causes a transition from x87 FPU to MMX technology operation
union reg128 result = {
.f64 = {
// These casts can't fail
source.i32[0],
source.i32[1],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_660F2A, safe_read64s, read_mmx64s)
void instr_F20F2A(int32_t source, int32_t r) {
// cvtsi2sd xmm, r32/m32
union reg64 result = {
// This cast can't fail
.f64 = { source }
};
write_xmm64(r, result);
}
DEFINE_SSE_SPLIT(instr_F20F2A, safe_read32s, read_reg32)
void instr_F30F2A(int32_t source, int32_t r) {
    // cvtsi2ss xmm, r/m32
    // Converts the signed 32-bit integer to single precision and stores it
    // in the lowest 32-bit lane of xmm register r.
    // Note: this conversion can be inexact (not every int32 fits a float)
    const float_t converted = (float_t)source;
    write_xmm_f32(r, converted);
}
DEFINE_SSE_SPLIT(instr_F30F2A, safe_read32s, read_reg32)
void instr_0F2B_reg(int32_t r1, int32_t r2) { trigger_ud(); }
void instr_0F2B_mem(int32_t addr, int32_t r) {
@ -1005,7 +1046,26 @@ void instr_660F50_mem(int32_t addr, int32_t r1) { trigger_ud(); }
// Not implemented: both handlers take no arguments and defer to unimplemented_sse().
void instr_0F51(void) { unimplemented_sse(); }
void instr_0F52(void) { unimplemented_sse(); }
void instr_0F53() { unimplemented_sse(); }
void instr_0F53(union reg128 source, int32_t r) {
// rcpps xmm, xmm/m128
union reg128 result = {
.f32 = {
1 / source.f32[0],
1 / source.f32[1],
1 / source.f32[2],
1 / source.f32[3],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_0F53, safe_read128s, read_xmm128s)
void instr_F30F53(float_t source, int32_t r) {
    // rcpss xmm, xmm/m32
    // Stores 1/source into the lowest single-precision lane of xmm register r.
    const float_t reciprocal = 1 / source;
    write_xmm_f32(r, reciprocal);
}
DEFINE_SSE_SPLIT(instr_F30F53, fpu_load_m32, read_xmm_f32)
void instr_0F54(union reg128 source, int32_t r) {
// andps xmm, xmm/mem128
@ -1063,11 +1123,138 @@ void instr_660F57(union reg128 source, int32_t r) {
}
DEFINE_SSE_SPLIT(instr_660F57, safe_read128s, read_xmm128s)
void instr_0F58() { unimplemented_sse(); }
void instr_0F59() { unimplemented_sse(); }
void instr_0F58(union reg128 source, int32_t r) {
// addps xmm, xmm/mem128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.f32 = {
source.f32[0] + destination.f32[0],
source.f32[1] + destination.f32[1],
source.f32[2] + destination.f32[2],
source.f32[3] + destination.f32[3],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_0F58, safe_read128s, read_xmm128s)
void instr_660F58(union reg128 source, int32_t r) {
// addpd xmm, xmm/mem128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.f64 = {
source.f64[0] + destination.f64[0],
source.f64[1] + destination.f64[1],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_660F58, safe_read128s, read_xmm128s)
void instr_F20F58(union reg64 source, int32_t r) {
// addsd xmm, xmm/mem64
union reg64 destination = read_xmm64s(r);
union reg64 result = {
.f64 = { source.f64[0] + destination.f64[0], }
};
write_xmm64(r, result);
}
DEFINE_SSE_SPLIT(instr_F20F58, safe_read64s, read_xmm64s)
void instr_F30F58(float_t source, int32_t r) {
    // addss xmm, xmm/mem32
    // Scalar single-precision add on the lowest lane of xmm register r.
    const float_t current = read_xmm_f32(r);
    const float sum = source + current;
    write_xmm_f32(r, sum);
}
DEFINE_SSE_SPLIT(instr_F30F58, fpu_load_m32, read_xmm_f32)
void instr_0F59(union reg128 source, int32_t r) {
// mulps xmm, xmm/mem128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.f32 = {
source.f32[0] * destination.f32[0],
source.f32[1] * destination.f32[1],
source.f32[2] * destination.f32[2],
source.f32[3] * destination.f32[3],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_0F59, safe_read128s, read_xmm128s)
void instr_660F59(union reg128 source, int32_t r) {
// mulpd xmm, xmm/mem128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.f64 = {
source.f64[0] * destination.f64[0],
source.f64[1] * destination.f64[1],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_660F59, safe_read128s, read_xmm128s)
void instr_F20F59(union reg64 source, int32_t r) {
// mulsd xmm, xmm/mem64
union reg64 destination = read_xmm64s(r);
union reg64 result = {
.f64 = { source.f64[0] * destination.f64[0], }
};
write_xmm64(r, result);
}
DEFINE_SSE_SPLIT(instr_F20F59, safe_read64s, read_xmm64s)
void instr_F30F59(float_t source, int32_t r) {
    // mulss xmm, xmm/mem32
    // Scalar single-precision multiply on the lowest lane of xmm register r.
    const float_t current = read_xmm_f32(r);
    const float product = source * current;
    write_xmm_f32(r, product);
}
DEFINE_SSE_SPLIT(instr_F30F59, fpu_load_m32, read_xmm_f32)
// Not implemented: both handlers take no arguments and defer to unimplemented_sse().
void instr_0F5A(void) { unimplemented_sse(); }
void instr_0F5B(void) { unimplemented_sse(); }
void instr_0F5C() { unimplemented_sse(); }
void instr_0F5C(union reg128 source, int32_t r) {
// subps xmm, xmm/mem128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.f32 = {
destination.f32[0] - source.f32[0],
destination.f32[1] - source.f32[1],
destination.f32[2] - source.f32[2],
destination.f32[3] - source.f32[3],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_0F5C, safe_read128s, read_xmm128s)
void instr_660F5C(union reg128 source, int32_t r) {
// subpd xmm, xmm/mem128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.f64 = {
destination.f64[0] - source.f64[0],
destination.f64[1] - source.f64[1],
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT(instr_660F5C, safe_read128s, read_xmm128s)
void instr_F20F5C(union reg64 source, int32_t r) {
// subsd xmm, xmm/mem64
union reg64 destination = read_xmm64s(r);
union reg64 result = {
.f64 = { destination.f64[0] - source.f64[0], }
};
write_xmm64(r, result);
}
DEFINE_SSE_SPLIT(instr_F20F5C, safe_read64s, read_xmm64s)
void instr_F30F5C(float_t source, int32_t r) {
    // subss xmm, xmm/mem32
    // Scalar single-precision subtract on the lowest lane: xmm[r] - source.
    const float_t current = read_xmm_f32(r);
    const float difference = current - source;
    write_xmm_f32(r, difference);
}
DEFINE_SSE_SPLIT(instr_F30F5C, fpu_load_m32, read_xmm_f32)
// Not implemented: these handlers take no arguments and defer to unimplemented_sse().
void instr_0F5D(void) { unimplemented_sse(); }
void instr_0F5E(void) { unimplemented_sse(); }
void instr_0F5F(void) { unimplemented_sse(); }
@ -2261,7 +2448,49 @@ DEFINE_MODRM_INSTR_READ_WRITE_8(instr_0FC0, xadd8(___, get_reg8_index(r)))
DEFINE_MODRM_INSTR_READ_WRITE_16(instr16_0FC1, xadd16(___, get_reg16_index(r)))
DEFINE_MODRM_INSTR_READ_WRITE_32(instr32_0FC1, xadd32(___, r))
void instr_0FC2() { unimplemented_sse(); }
void instr_0FC2(union reg128 source, int32_t r, int32_t imm8) {
// cmpps xmm, xmm/m128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.i32 = {
sse_comparison(imm8, destination.f32[0], source.f32[0]) ? -1 : 0,
sse_comparison(imm8, destination.f32[1], source.f32[1]) ? -1 : 0,
sse_comparison(imm8, destination.f32[2], source.f32[2]) ? -1 : 0,
sse_comparison(imm8, destination.f32[3], source.f32[3]) ? -1 : 0,
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT_IMM(instr_0FC2, safe_read128s, read_xmm128s)
void instr_660FC2(union reg128 source, int32_t r, int32_t imm8) {
// cmppd xmm, xmm/m128
union reg128 destination = read_xmm128s(r);
union reg128 result = {
.i64 = {
sse_comparison(imm8, destination.f64[0], source.f64[0]) ? -1 : 0,
sse_comparison(imm8, destination.f64[1], source.f64[1]) ? -1 : 0,
}
};
write_xmm_reg128(r, result);
}
DEFINE_SSE_SPLIT_IMM(instr_660FC2, safe_read128s, read_xmm128s)
void instr_F20FC2(union reg64 source, int32_t r, int32_t imm8) {
// cmpsd xmm, xmm/m64
union reg64 destination = read_xmm64s(r);
union reg64 result = {
.i64 = { sse_comparison(imm8, destination.f64[0], source.f64[0]) ? -1 : 0, }
};
write_xmm64(r, result);
}
DEFINE_SSE_SPLIT_IMM(instr_F20FC2, safe_read64s, read_xmm64s)
void instr_F30FC2(float_t source, int32_t r, int32_t imm8) {
    // cmpss xmm, xmm/m32
    // Compares the lowest single of xmm[r] (left operand) with source using
    // the predicate selected by imm8; the lowest 32-bit lane of xmm[r] becomes
    // all ones (-1) when the predicate holds and 0 otherwise.
    const float_t destination = read_xmm_f32(r);
    int32_t mask = 0;

    if(sse_comparison(imm8, destination, source))
    {
        mask = -1;
    }

    write_xmm32(r, mask);
}
DEFINE_SSE_SPLIT_IMM(instr_F30FC2, fpu_load_m32, read_xmm_f32)
void instr_0FC3_reg(int32_t r1, int32_t r2) { trigger_ud(); }
void instr_0FC3_mem(int32_t addr, int32_t r) {

View file

@ -174,7 +174,6 @@ void instr_0F29_mem(int32_t addr, int32_t r);
void instr_0F29_reg(int32_t r1, int32_t r2);
void instr_660F29_mem(int32_t addr, int32_t r);
void instr_660F29_reg(int32_t r1, int32_t r2);
void instr_0F2A(void);
void instr_0F2B_reg(int32_t r1, int32_t r2);
void instr_0F2B_mem(int32_t addr, int32_t r);
void instr_660F2B_reg(int32_t r1, int32_t r2);
@ -275,9 +274,6 @@ void instr_0F50_reg(int32_t r1, int32_t r2);
void instr_0F50_mem(int32_t addr, int32_t r1);
void instr_660F50_reg(int32_t r1, int32_t r2);
void instr_660F50_mem(int32_t addr, int32_t r1);
void instr_0F51(void);
void instr_0F52(void);
void instr_0F53(void);
void instr_0F54(union reg128 source, int32_t r);
void instr_0F54_reg(int32_t r1, int32_t r2);
void instr_0F54_mem(int32_t addr, int32_t r);
@ -302,14 +298,7 @@ void instr_0F57_mem(int32_t addr, int32_t r);
void instr_660F57(union reg128 source, int32_t r);
void instr_660F57_reg(int32_t r1, int32_t r2);
void instr_660F57_mem(int32_t addr, int32_t r);
void instr_0F58(void);
void instr_0F59(void);
void instr_0F5A(void);
void instr_0F5B(void);
void instr_0F5C(void);
void instr_0F5D(void);
void instr_0F5E(void);
void instr_0F5F(void);
void instr_0F60(int32_t source, int32_t r);
void instr_0F60_reg(int32_t r1, int32_t r2);
void instr_0F60_mem(int32_t addr, int32_t r);
@ -699,7 +688,6 @@ void instr16_0FC1_mem(int32_t addr, int32_t r);
void instr16_0FC1_reg(int32_t r1, int32_t r);
void instr32_0FC1_mem(int32_t addr, int32_t r);
void instr32_0FC1_reg(int32_t r1, int32_t r);
void instr_0FC2(void);
void instr_0FC3_reg(int32_t r1, int32_t r2);
void instr_0FC3_mem(int32_t addr, int32_t r);
void instr_0FC4(int32_t source, int32_t r, int32_t imm8);

View file

@ -398,3 +398,22 @@ void psllq_r128(int32_t r, uint32_t shift)
write_xmm_reg128(r, result);
}
// Evaluates an SSE compare predicate on a pair of operands.
//
// op: predicate selector; only the low three bits are used:
//       0 equal            4 not equal
//       1 less than        5 not less than
//       2 less or equal    6 not less or equal
//       3 unordered        7 ordered
// x:  left operand, y: right operand.
//
// Returns true when the predicate holds. Predicates 0-2 are false and
// predicates 4-6 are true whenever either operand is NaN.
bool sse_comparison(int32_t op, double_t x, double_t y)
{
    // TODO: Signaling

    switch(op & 7)
    {
        case 0: return x == y;
        case 1: return x < y;
        case 2: return x <= y;
        case 3: return isnan(x) || isnan(y);
        // The negated predicates are the exact complements of 0-2; a
        // comparison involving NaN is false, so its negation is true.
        case 4: return !(x == y);
        case 5: return !(x < y);
        case 6: return !(x <= y);
        case 7: return !isnan(x) && !isnan(y);
        default:
            // Unreachable: op & 7 is always in 0..7. The explicit return keeps
            // the function well-defined when assert is compiled out (NDEBUG).
            assert(false);
            return false;
    }
}

View file

@ -31,3 +31,5 @@ void psrad_r128(int32_t r, uint32_t shift);
void pslld_r128(int32_t r, uint32_t shift);
void psrlq_r128(int32_t r, uint32_t shift);
void psllq_r128(int32_t r, uint32_t shift);
bool sse_comparison(int32_t op, double_t x, double_t y);

View file

@ -2791,7 +2791,6 @@ void test_sse(void)
MOVMSK(movmskpd);
/* FPU specific ops */
/*
{
uint32_t mxcsr;
asm volatile("stmxcsr %0" : "=m" (mxcsr));
@ -2822,10 +2821,10 @@ void test_sse(void)
SSE_OPS(add);
SSE_OPS(mul);
SSE_OPS(sub);
SSE_OPS(min);
SSE_OPS(div);
SSE_OPS(max);
SSE_OPS(sqrt);
//SSE_OPS(min);
//SSE_OPS(div);
//SSE_OPS(max);
//SSE_OPS(sqrt);
SSE_OPS(cmpeq);
SSE_OPS(cmplt);
SSE_OPS(cmple);
@ -2847,10 +2846,10 @@ void test_sse(void)
SSE_OPD(add);
SSE_OPD(mul);
SSE_OPD(sub);
SSE_OPD(min);
SSE_OPD(div);
SSE_OPD(max);
SSE_OPD(sqrt);
//SSE_OPD(min);
//SSE_OPD(div);
//SSE_OPD(max);
//SSE_OPD(sqrt);
SSE_OPD(cmpeq);
SSE_OPD(cmplt);
SSE_OPD(cmple);
@ -2860,7 +2859,6 @@ void test_sse(void)
SSE_OPD(cmpnle);
SSE_OPD(cmpord);
}
*/
/* float to float/int */
/*