Use softfloat f80 for x87 fpu

This fixes several long-standing issues with x87 float emulation, in particular:

- 80-bit precision floats, fixing Haiku after its switch to musl libc (hrev53728)
- the precision bit in the x87 control word
- fucom and fucomi (unordered comparisons)
- aliasing of x87 and mmx registers
- rounding during conversion to integers

Operations that are not implemented in softfloat are implemented by converting to f64 (sine, pow, ln, etc.) and thus operate with lower precision.

Softfloat has been combined into a single file using a script [0] and checked into the repository.

[0] 57df21e2eb/contrib/single_file_libs/combine.sh
2020-12-31 19:14:32 -06:00 · 2020-12-31 19:14:32 -06:00 · b96f984963
commit b96f984963
parent 1d22abda38
16 changed files with 33343 additions and 670 deletions
--- a/12
+++ b/12
@ -71,6 +71,7 @@ CARGO_FLAGS=\
 		-- \
 		-C linker=tools/rust-lld-wrapper \
 		-C link-args="--import-table --global-base=262144 $(STRIP_DEBUG_FLAG)" \
+		-C link-args="build/softfloat.o" \
 		--verbose

 CORE_FILES=const.js config.js io.js main.js lib.js ide.js pci.js floppy.js \
@ -154,20 +155,26 @@ src/rust/gen/analyzer.rs: $(ANALYZER_DEPENDENCIES)
 src/rust/gen/analyzer0f.rs: $(ANALYZER_DEPENDENCIES)
 	./gen/generate_analyzer.js --output-dir build/ --table analyzer0f

-build/v86.wasm: $(RUST_FILES) Cargo.toml
+build/v86.wasm: $(RUST_FILES) build/softfloat.o Cargo.toml
 	mkdir -p build/
 	-ls -lh build/v86.wasm
 	cargo +nightly rustc --release $(CARGO_FLAGS)
 	mv build/wasm32-unknown-unknown/release/v86.wasm build/v86.wasm
 	ls -lh build/v86.wasm

-build/v86-debug.wasm: $(RUST_FILES) Cargo.toml
+build/v86-debug.wasm: $(RUST_FILES) build/softfloat.o Cargo.toml
 	mkdir -p build/
 	-ls -lh build/v86-debug.wasm
 	cargo +nightly rustc $(CARGO_FLAGS)
 	mv build/wasm32-unknown-unknown/debug/v86.wasm build/v86-debug.wasm
 	ls -lh build/v86-debug.wasm

+# Compile the single-file softfloat C source to a wasm32 object; CARGO_FLAGS links it via link-args.
+build/softfloat.o: lib/softfloat/softfloat.c
+	mkdir -p build/
+	clang -c --target=wasm32 -O2 -flto -nostdlib -fvisibility=hidden -ffunction-sections -fdata-sections \
+	    -DSOFTFLOAT_FAST_INT64 -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32 -o build/softfloat.o lib/softfloat/softfloat.c
+
 clean:
 	-rm build/libv86.js
 	-rm build/libv86-debug.js
@ -177,6 +184,7 @@ clean:
 	-rm $(INSTRUCTION_TABLES)
 	-rm build/*.map
 	-rm build/*.wast
+	-rm build/*.o
 	$(MAKE) -C $(NASM_TEST_DIR) clean

 run:
--- a/lib/softfloat/softfloat.c
+++ b/lib/softfloat/softfloat.c
--- a/src/cpu.js
+++ b/src/cpu.js
@ -137,17 +137,16 @@ function CPU(bus, wm)
    this.reg8s = v86util.view(Int8Array, memory, 64, 32);
    this.reg8 = v86util.view(Uint8Array, memory, 64, 32);

-    // Why no Float80Array :-(
-    this.fpu_st = v86util.view(Float64Array, memory, 968, 8);
+    this.fpu_st = v86util.view(Int32Array, memory, 1152, 4 * 8);

-    this.fpu_stack_empty = v86util.view(Int32Array, memory, 816, 1);
+    this.fpu_stack_empty = v86util.view(Uint8Array, memory, 816, 1);
    this.fpu_stack_empty[0] = 0xFF;
-    this.fpu_stack_ptr = v86util.view(Uint32Array, memory, 1032, 1);
+    this.fpu_stack_ptr = v86util.view(Uint8Array, memory, 1032, 1);
    this.fpu_stack_ptr[0] = 0;

-    this.fpu_control_word = v86util.view(Int32Array, memory, 1036, 1);
+    this.fpu_control_word = v86util.view(Uint16Array, memory, 1036, 1);
    this.fpu_control_word[0] = 0x37F;
-    this.fpu_status_word = v86util.view(Int32Array, memory, 1040, 1);
+    this.fpu_status_word = v86util.view(Uint16Array, memory, 1040, 1);
    this.fpu_status_word[0] = 0;
    this.fpu_ip = v86util.view(Int32Array, memory, 1048, 1);
    this.fpu_ip[0] = 0;
@ -160,14 +159,6 @@ function CPU(bus, wm)
    this.fpu_dp_selector = v86util.view(Int32Array, memory, 1060, 1);
    this.fpu_dp_selector[0] = 0;

-    // mm0-mm7 split up into 32 bit pairs
-    this.reg_mmxs = v86util.view(Int32Array, memory, 1064, 16);
-    this.reg_mmx = v86util.view(Uint32Array, this.reg_mmxs.buffer, 1064, 16);
-    this.reg_mmx8s = v86util.view(Int8Array, this.reg_mmxs.buffer, 1064, 64);
-    this.reg_mmx8 = v86util.view(Uint8Array, this.reg_mmxs.buffer, 1064, 64);
-
-    this.fxsave_store_fpu_mask = v86util.view(Uint8Array, memory, 1132, 1);
-
    this.reg_xmm32s = v86util.view(Int32Array, memory, 832, 8 * 4);

    this.mxcsr = v86util.view(Int32Array, memory, 824, 1);
@ -297,6 +288,8 @@ CPU.prototype.wasm_patch = function(wm)
    this.set_tsc = get_import("set_tsc");
    this.store_current_tsc = get_import("store_current_tsc");

+    this.fpu_get_sti_f64 = get_import("fpu_get_sti_f64");
+
    if(DEBUG)
    {
        this.jit_force_generate_unsafe = get_optional_import("jit_force_generate_unsafe");
@ -402,7 +395,6 @@ CPU.prototype.get_state = function()

    state[64] = this.tss_size_32[0];

-    state[65] = this.reg_mmxs;
    state[66] = this.reg_xmm32s;

    state[67] = this.fpu_st;
@ -495,7 +487,6 @@ CPU.prototype.set_state = function(state)

    this.tss_size_32[0] = state[64];

-    this.reg_mmxs.set(state[65]);
    this.reg_xmm32s.set(state[66]);

    this.fpu_st.set(state[67]);
@ -641,7 +632,6 @@ CPU.prototype.reset = function()
    this.fpu_dp[0] = 0;
    this.fpu_dp_selector[0] = 0;

-    this.reg_mmxs.fill(0);
    this.reg_xmm32s.fill(0);

    this.mxcsr[0] = 0x1F80;
--- a/src/rust/codegen.rs
+++ b/src/rust/codegen.rs
@ -1587,19 +1587,68 @@ pub fn gen_test_jcxz(ctx: &mut JitContext, is_asize_32: bool) {
 }

 pub fn gen_fpu_get_sti(ctx: &mut JitContext, i: u32) {
+    ctx.builder
+        .const_i32(global_pointers::sse_scratch_register as i32);
    ctx.builder.const_i32(i as i32);
-    ctx.builder.call_fn1_ret_f64("fpu_get_sti");
+    ctx.builder.call_fn2("fpu_get_sti");
+    ctx.builder
+        .load_fixed_i64(global_pointers::sse_scratch_register as u32);
+    ctx.builder
+        .load_fixed_u16(global_pointers::sse_scratch_register as u32 + 8);
 }

 pub fn gen_fpu_load_m32(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    ctx.builder
+        .const_i32(global_pointers::sse_scratch_register as i32);
    gen_modrm_resolve_safe_read32(ctx, modrm_byte);
-    ctx.builder.reinterpret_i32_as_f32();
-    ctx.builder.promote_f32_to_f64();
+    ctx.builder.call_fn2("f32_to_f80");
+    ctx.builder
+        .load_fixed_i64(global_pointers::sse_scratch_register as u32);
+    ctx.builder
+        .load_fixed_u16(global_pointers::sse_scratch_register as u32 + 8);
 }

 pub fn gen_fpu_load_m64(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    ctx.builder
+        .const_i32(global_pointers::sse_scratch_register as i32);
    gen_modrm_resolve_safe_read64(ctx, modrm_byte);
-    ctx.builder.reinterpret_i64_as_f64();
+    ctx.builder.call_fn2_i32_i64("f64_to_f80");
+    ctx.builder
+        .load_fixed_i64(global_pointers::sse_scratch_register as u32);
+    ctx.builder
+        .load_fixed_u16(global_pointers::sse_scratch_register as u32 + 8);
+}
+
+pub fn gen_fpu_load_i16(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    ctx.builder
+        .const_i32(global_pointers::sse_scratch_register as i32);
+    gen_modrm_resolve_safe_read16(ctx, modrm_byte);
+    sign_extend_i16(ctx.builder);
+    ctx.builder.call_fn2("i32_to_f80");
+    ctx.builder
+        .load_fixed_i64(global_pointers::sse_scratch_register as u32);
+    ctx.builder
+        .load_fixed_u16(global_pointers::sse_scratch_register as u32 + 8);
+}
+pub fn gen_fpu_load_i32(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    ctx.builder
+        .const_i32(global_pointers::sse_scratch_register as i32);
+    gen_modrm_resolve_safe_read32(ctx, modrm_byte);
+    ctx.builder.call_fn2("i32_to_f80");
+    ctx.builder
+        .load_fixed_i64(global_pointers::sse_scratch_register as u32);
+    ctx.builder
+        .load_fixed_u16(global_pointers::sse_scratch_register as u32 + 8);
+}
+pub fn gen_fpu_load_i64(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    ctx.builder
+        .const_i32(global_pointers::sse_scratch_register as i32);
+    gen_modrm_resolve_safe_read64(ctx, modrm_byte);
+    ctx.builder.call_fn2_i32_i64("i64_to_f80");
+    ctx.builder
+        .load_fixed_i64(global_pointers::sse_scratch_register as u32);
+    ctx.builder
+        .load_fixed_u16(global_pointers::sse_scratch_register as u32 + 8);
 }

 pub fn gen_trigger_de(ctx: &mut JitContext) {
--- a/src/rust/cpu/cpu.rs
+++ b/src/rust/cpu/cpu.rs
@ -2672,6 +2672,10 @@ pub unsafe fn safe_read32s(addr: i32) -> OrPageFault<i32> {
    }
 }

+pub unsafe fn safe_read_f32(addr: i32) -> OrPageFault<f32> {
+    Ok(std::mem::transmute(safe_read32s(addr)?))
+}
+
 pub unsafe fn safe_read64s(addr: i32) -> OrPageFault<u64> {
    if addr & 0xFFF > 0x1000 - 8 {
        Ok(safe_read32s(addr)? as u32 as u64 | (safe_read32s(addr + 4)? as u32 as u64) << 32)
@ -3121,14 +3125,11 @@ pub unsafe fn write_reg32(index: i32, value: i32) {
    *reg32.offset(index as isize) = value;
 }

-pub unsafe fn read_mmx32s(r: i32) -> i32 { *reg_mmx.offset(r as isize) as i32 }
+pub unsafe fn read_mmx32s(r: i32) -> i32 { (*fpu_st.offset(r as isize)).mantissa as i32 }

-pub unsafe fn read_mmx64s(r: i32) -> u64 { *reg_mmx.offset(r as isize) }
+pub unsafe fn read_mmx64s(r: i32) -> u64 { (*fpu_st.offset(r as isize)).mantissa }

-pub unsafe fn write_mmx_reg64(r: i32, data: u64) {
-    *fxsave_store_fpu_mask &= !(1 << r);
-    *reg_mmx.offset(r as isize) = data;
-}
+pub unsafe fn write_mmx_reg64(r: i32, data: u64) { (*fpu_st.offset(r as isize)).mantissa = data; }

 pub unsafe fn read_xmm_f32(r: i32) -> f32 { return (*reg_xmm.offset(r as isize)).f32_0[0]; }

--- a/src/rust/cpu/fpu.rs
+++ b/src/rust/cpu/fpu.rs
--- a/src/rust/cpu/global_pointers.rs
+++ b/src/rust/cpu/global_pointers.rs
@ -1,6 +1,7 @@
 #![allow(non_upper_case_globals)]

 use cpu::cpu::reg128;
+use softfloat::F80;

 pub const reg8: *mut u8 = 64 as *mut u8;
 pub const reg16: *mut u16 = 64 as *mut u16;
@ -44,27 +45,26 @@ pub const protected_mode: *mut bool = 800 as *mut bool;
 pub const is_32: *mut bool = 804 as *mut bool;
 pub const stack_size_32: *mut bool = 808 as *mut bool;
 pub const memory_size: *mut u32 = 812 as *mut u32;
-pub const fpu_stack_empty: *mut i32 = 816 as *mut i32;
+pub const fpu_stack_empty: *mut u8 = 816 as *mut u8;
 pub const mxcsr: *mut i32 = 824 as *mut i32;
-// gap
+
 pub const reg_xmm: *mut reg128 = 832 as *mut reg128;
 pub const current_tsc: *mut u64 = 960 as *mut u64;
-pub const fpu_st: *mut f64 = 968 as *mut f64;
-pub const fpu_st8: *mut u8 = 968 as *mut u8;
-pub const fpu_st32: *mut i32 = 968 as *mut i32;
-pub const fpu_stack_ptr: *mut u32 = 1032 as *mut u32;
-pub const fpu_control_word: *mut i32 = 1036 as *mut i32;
-pub const fpu_status_word: *mut i32 = 1040 as *mut i32;
+
+pub const fpu_stack_ptr: *mut u8 = 1032 as *mut u8;
+pub const fpu_control_word: *mut u16 = 1036 as *mut u16;
+pub const fpu_status_word: *mut u16 = 1040 as *mut u16;
 pub const fpu_opcode: *mut i32 = 1044 as *mut i32;
 pub const fpu_ip: *mut i32 = 1048 as *mut i32;
 pub const fpu_ip_selector: *mut i32 = 1052 as *mut i32;
 pub const fpu_dp: *mut i32 = 1056 as *mut i32;
 pub const fpu_dp_selector: *mut i32 = 1060 as *mut i32;
-pub const reg_mmx: *mut u64 = 1064 as *mut u64;
 pub const tss_size_32: *mut bool = 1128 as *mut bool;
-pub const fxsave_store_fpu_mask: *mut u8 = 1132 as *mut u8;
+
 pub const sse_scratch_register: *mut reg128 = 1136 as *mut reg128;

+pub const fpu_st: *mut F80 = 1152 as *mut F80;
+
 pub const opstats_buffer: *mut u32 = 0x08000 as *mut u32;
 pub const opstats_compiled_buffer: *mut u32 = 0x10000 as *mut u32;
 pub const opstats_jit_exit_buffer: *mut u32 = 0x18000 as *mut u32;
@ -78,7 +78,7 @@ pub fn get_reg32_offset(r: u32) -> u32 {

 pub fn get_reg_mmx_offset(r: u32) -> u32 {
    dbg_assert!(r < 8);
-    (unsafe { reg_mmx.offset(r as isize) }) as u32
+    (unsafe { fpu_st.offset(r as isize) }) as u32
 }

 pub fn get_reg_xmm_offset(r: u32) -> u32 {
--- a/src/rust/cpu/instructions.rs
+++ b/src/rust/cpu/instructions.rs
@ -12,6 +12,7 @@ use cpu::global_pointers::*;
 use cpu::misc_instr::*;
 use cpu::misc_instr::{pop16, pop32s, push16, push32};
 use cpu::string::*;
+use softfloat::F80;

 pub unsafe fn instr_00_mem(addr: i32, r: i32) {
    SAFE_READ_WRITE8!(___, addr, add8(___, read_reg8(r)));
@ -2416,13 +2417,13 @@ pub unsafe fn instr16_D9_5_mem(addr: i32) { fpu_fldcw(addr); }
 pub unsafe fn instr16_D9_5_reg(r: i32) {
    // fld1/fldl2t/fldl2e/fldpi/fldlg2/fldln2/fldz
    match r {
-        0 => fpu_push(1.0),
-        1 => fpu_push(std::f64::consts::LN_10 / std::f64::consts::LN_2),
-        2 => fpu_push(std::f64::consts::LOG2_E),
-        3 => fpu_push(std::f64::consts::PI),
-        4 => fpu_push(std::f64::consts::LN_2 / std::f64::consts::LN_10),
-        5 => fpu_push(std::f64::consts::LN_2),
-        6 => fpu_push(0.0),
+        0 => fpu_push(F80::ONE),
+        1 => fpu_push(F80::LN_10 / F80::LN_2),
+        2 => fpu_push(F80::LOG2_E),
+        3 => fpu_push(F80::PI),
+        4 => fpu_push(F80::LN_2 / F80::LN_10),
+        5 => fpu_push(F80::LN_2),
+        6 => fpu_push(F80::ZERO),
        7 => {
            dbg_log!("d9/5/7");
            trigger_ud();
@ -2483,36 +2484,21 @@ pub unsafe fn instr32_D9_5_mem(r: i32) { instr16_D9_5_mem(r) }
 pub unsafe fn instr32_D9_7_mem(r: i32) { instr16_D9_7_mem(r) }

 #[no_mangle]
-pub unsafe fn instr_DA_0_mem(addr: i32) {
-    fpu_fadd(0, return_on_pagefault!(safe_read32s(addr)) as f64);
-}
+pub unsafe fn instr_DA_0_mem(addr: i32) { fpu_fadd(0, return_on_pagefault!(fpu_load_i32(addr))); }
 #[no_mangle]
-pub unsafe fn instr_DA_1_mem(addr: i32) {
-    fpu_fmul(0, return_on_pagefault!(safe_read32s(addr)) as f64);
-}
+pub unsafe fn instr_DA_1_mem(addr: i32) { fpu_fmul(0, return_on_pagefault!(fpu_load_i32(addr))); }
 #[no_mangle]
-pub unsafe fn instr_DA_2_mem(addr: i32) {
-    fpu_fcom(return_on_pagefault!(safe_read32s(addr)) as f64);
-}
+pub unsafe fn instr_DA_2_mem(addr: i32) { fpu_fcom(return_on_pagefault!(fpu_load_i32(addr))); }
 #[no_mangle]
-pub unsafe fn instr_DA_3_mem(addr: i32) {
-    fpu_fcomp(return_on_pagefault!(safe_read32s(addr)) as f64);
-}
+pub unsafe fn instr_DA_3_mem(addr: i32) { fpu_fcomp(return_on_pagefault!(fpu_load_i32(addr))); }
 #[no_mangle]
-pub unsafe fn instr_DA_4_mem(addr: i32) {
-    fpu_fsub(0, return_on_pagefault!(safe_read32s(addr)) as f64);
-}
-pub unsafe fn instr_DA_5_mem(addr: i32) {
-    fpu_fsubr(0, return_on_pagefault!(safe_read32s(addr)) as f64);
-}
+pub unsafe fn instr_DA_4_mem(addr: i32) { fpu_fsub(0, return_on_pagefault!(fpu_load_i32(addr))); }
 #[no_mangle]
-pub unsafe fn instr_DA_6_mem(addr: i32) {
-    fpu_fdiv(0, return_on_pagefault!(safe_read32s(addr)) as f64);
-}
+pub unsafe fn instr_DA_5_mem(addr: i32) { fpu_fsubr(0, return_on_pagefault!(fpu_load_i32(addr))); }
 #[no_mangle]
-pub unsafe fn instr_DA_7_mem(addr: i32) {
-    fpu_fdivr(0, return_on_pagefault!(safe_read32s(addr)) as f64);
-}
+pub unsafe fn instr_DA_6_mem(addr: i32) { fpu_fdiv(0, return_on_pagefault!(fpu_load_i32(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DA_7_mem(addr: i32) { fpu_fdivr(0, return_on_pagefault!(fpu_load_i32(addr))); }
 #[no_mangle]
 pub unsafe fn instr_DA_0_reg(r: i32) { fpu_fcmovcc(test_b(), r); }
 #[no_mangle]
@ -2651,30 +2637,24 @@ pub unsafe fn instr32_DD_5_mem(r: i32) { instr16_DD_5_mem(r) }
 #[no_mangle]
 pub unsafe fn instr32_DD_7_mem(r: i32) { instr16_DD_7_mem(r) }

-pub unsafe fn instr_DE_0_mem(addr: i32) {
-    fpu_fadd(0, return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
-pub unsafe fn instr_DE_1_mem(addr: i32) {
-    fpu_fmul(0, return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
-pub unsafe fn instr_DE_2_mem(addr: i32) {
-    fpu_fcom(return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
-pub unsafe fn instr_DE_3_mem(addr: i32) {
-    fpu_fcomp(return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
-pub unsafe fn instr_DE_4_mem(addr: i32) {
-    fpu_fsub(0, return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
-pub unsafe fn instr_DE_5_mem(addr: i32) {
-    fpu_fsubr(0, return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
-pub unsafe fn instr_DE_6_mem(addr: i32) {
-    fpu_fdiv(0, return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
-pub unsafe fn instr_DE_7_mem(addr: i32) {
-    fpu_fdivr(0, return_on_pagefault!(safe_read16(addr)) as i16 as f64);
-}
+#[no_mangle]
+pub unsafe fn instr_DE_0_mem(addr: i32) { fpu_fadd(0, return_on_pagefault!(fpu_load_i16(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DE_1_mem(addr: i32) { fpu_fmul(0, return_on_pagefault!(fpu_load_i16(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DE_2_mem(addr: i32) { fpu_fcom(return_on_pagefault!(fpu_load_i16(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DE_3_mem(addr: i32) { fpu_fcomp(return_on_pagefault!(fpu_load_i16(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DE_4_mem(addr: i32) { fpu_fsub(0, return_on_pagefault!(fpu_load_i16(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DE_5_mem(addr: i32) { fpu_fsubr(0, return_on_pagefault!(fpu_load_i16(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DE_6_mem(addr: i32) { fpu_fdiv(0, return_on_pagefault!(fpu_load_i16(addr))); }
+#[no_mangle]
+pub unsafe fn instr_DE_7_mem(addr: i32) { fpu_fdivr(0, return_on_pagefault!(fpu_load_i16(addr))); }
+
+#[no_mangle]
 pub unsafe fn instr_DE_0_reg(r: i32) {
    fpu_fadd(r, fpu_get_sti(r));
    fpu_pop();
--- a/src/rust/cpu/instructions_0f.rs
+++ b/src/rust/cpu/instructions_0f.rs
@ -38,7 +38,6 @@ use cpu::arith::{
    saturate_sw_to_sb, saturate_sw_to_ub, saturate_ud_to_ub, saturate_uw,
 };
 use cpu::cpu::*;
-use cpu::fpu::fpu_load_m32;
 use cpu::fpu::fpu_set_tag_word;
 use cpu::global_pointers::*;
 use cpu::misc_instr::{
@ -1009,7 +1008,7 @@ pub unsafe fn instr_F30F2C(source: f32, r: i32) {
 }
 #[no_mangle]
 pub unsafe fn instr_F30F2C_mem(addr: i32, r: i32) {
-    instr_F30F2C(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F2C(return_on_pagefault!(safe_read_f32(addr)), r);
 }
 #[no_mangle]
 pub unsafe fn instr_F30F2C_reg(r1: i32, r2: i32) { instr_F30F2C(read_xmm_f32(r1), r2); }
@ -1037,7 +1036,7 @@ pub unsafe fn instr_0F2E(source: f32, r: i32) {
 pub unsafe fn instr_0F2E_reg(r1: i32, r2: i32) { instr_0F2E(read_xmm_f32(r1), r2) }
 #[no_mangle]
 pub unsafe fn instr_0F2E_mem(addr: i32, r: i32) {
-    instr_0F2E(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_0F2E(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_660F2E(source: u64, r: i32) {
@ -1090,7 +1089,7 @@ pub unsafe fn instr_0F2F(source: f32, r: i32) {
 pub unsafe fn instr_0F2F_reg(r1: i32, r2: i32) { instr_0F2F(read_xmm_f32(r1), r2) }
 #[no_mangle]
 pub unsafe fn instr_0F2F_mem(addr: i32, r: i32) {
-    instr_0F2F(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_0F2F(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_660F2F(source: u64, r: i32) {
@ -4520,7 +4519,7 @@ pub unsafe fn instr_F30F2D(source: f32, r: i32) {
 pub unsafe fn instr_F30F2D_reg(r1: i32, r2: i32) { instr_F30F2D(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F2D_mem(addr: i32, r: i32) {
-    instr_F30F2D(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F2D(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_0F51(source: reg128, r: i32) {
@ -4576,7 +4575,7 @@ pub unsafe fn instr_F30F51(source: f32, r: i32) {
 pub unsafe fn instr_F30F51_reg(r1: i32, r2: i32) { instr_F30F51(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F51_mem(addr: i32, r: i32) {
-    instr_F30F51(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F51(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_0F52(source: reg128, r: i32) {
@ -4605,7 +4604,7 @@ pub unsafe fn instr_F30F52(source: f32, r: i32) {
 pub unsafe fn instr_F30F52_reg(r1: i32, r2: i32) { instr_F30F52(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F52_mem(addr: i32, r: i32) {
-    instr_F30F52(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F52(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_0F53(source: reg128, r: i32) {
@ -4634,7 +4633,7 @@ pub unsafe fn instr_F30F53(source: f32, r: i32) {
 pub unsafe fn instr_F30F53_reg(r1: i32, r2: i32) { instr_F30F53(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F53_mem(addr: i32, r: i32) {
-    instr_F30F53(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F53(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_0F58(source: reg128, r: i32) {
@ -4694,7 +4693,7 @@ pub unsafe fn instr_F30F58(source: f32, r: i32) {
 pub unsafe fn instr_F30F58_reg(r1: i32, r2: i32) { instr_F30F58(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F58_mem(addr: i32, r: i32) {
-    instr_F30F58(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F58(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_0F59(source: reg128, r: i32) {
@ -4754,7 +4753,7 @@ pub unsafe fn instr_F30F59(source: f32, r: i32) {
 pub unsafe fn instr_F30F59_reg(r1: i32, r2: i32) { instr_F30F59(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F59_mem(addr: i32, r: i32) {
-    instr_F30F59(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F59(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_0F5A(source: u64, r: i32) {
@ -4804,7 +4803,7 @@ pub unsafe fn instr_F30F5A(source: f32, r: i32) {
 pub unsafe fn instr_F30F5A_reg(r1: i32, r2: i32) { instr_F30F5A(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F5A_mem(addr: i32, r: i32) {
-    instr_F30F5A(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F5A(return_on_pagefault!(safe_read_f32(addr)), r);
 }

 pub unsafe fn instr_0F5B(source: reg128, r: i32) {
@ -4922,7 +4921,7 @@ pub unsafe fn instr_F30F5C(source: f32, r: i32) {
 pub unsafe fn instr_F30F5C_reg(r1: i32, r2: i32) { instr_F30F5C(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F5C_mem(addr: i32, r: i32) {
-    instr_F30F5C(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F5C(return_on_pagefault!(safe_read_f32(addr)), r);
 }
 pub unsafe fn instr_0F5D(source: reg128, r: i32) {
    // minps xmm, xmm/mem128
@ -4984,7 +4983,7 @@ pub unsafe fn instr_F30F5D(source: f32, r: i32) {
 pub unsafe fn instr_F30F5D_reg(r1: i32, r2: i32) { instr_F30F5D(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F5D_mem(addr: i32, r: i32) {
-    instr_F30F5D(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F5D(return_on_pagefault!(safe_read_f32(addr)), r);
 }
 pub unsafe fn instr_0F5E(source: reg128, r: i32) {
    // divps xmm, xmm/mem128
@ -5043,7 +5042,7 @@ pub unsafe fn instr_F30F5E(source: f32, r: i32) {
 pub unsafe fn instr_F30F5E_reg(r1: i32, r2: i32) { instr_F30F5E(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F5E_mem(addr: i32, r: i32) {
-    instr_F30F5E(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F5E(return_on_pagefault!(safe_read_f32(addr)), r);
 }
 pub unsafe fn instr_0F5F(source: reg128, r: i32) {
    // maxps xmm, xmm/mem128
@ -5105,7 +5104,7 @@ pub unsafe fn instr_F30F5F(source: f32, r: i32) {
 pub unsafe fn instr_F30F5F_reg(r1: i32, r2: i32) { instr_F30F5F(read_xmm_f32(r1), r2); }
 #[no_mangle]
 pub unsafe fn instr_F30F5F_mem(addr: i32, r: i32) {
-    instr_F30F5F(return_on_pagefault!(fpu_load_m32(addr)) as f32, r);
+    instr_F30F5F(return_on_pagefault!(safe_read_f32(addr)), r);
 }
 pub unsafe fn instr_0FC2(source: reg128, r: i32, imm8: i32) {
    // cmpps xmm, xmm/m128
@ -5182,5 +5181,5 @@ pub unsafe fn instr_F30FC2_reg(r1: i32, r2: i32, imm: i32) {
 }
 #[no_mangle]
 pub unsafe fn instr_F30FC2_mem(addr: i32, r: i32, imm: i32) {
-    instr_F30FC2(return_on_pagefault!(fpu_load_m32(addr)) as f32, r, imm);
+    instr_F30FC2(return_on_pagefault!(safe_read_f32(addr)), r, imm);
 }
--- a/src/rust/cpu/misc_instr.rs
+++ b/src/rust/cpu/misc_instr.rs
@ -1,5 +1,7 @@
 use cpu::cpu::*;
-use cpu::fpu::{fpu_load_m80, fpu_load_status_word, fpu_set_status_word, fpu_store_m80};
+use cpu::fpu::{
+    fpu_load_m80, fpu_load_status_word, fpu_set_status_word, fpu_store_m80, set_control_word,
+};
 use cpu::global_pointers::*;
 use paging::OrPageFault;

@ -363,9 +365,9 @@ pub unsafe fn setcc_mem(condition: bool, addr: i32) {
 #[no_mangle]
 pub unsafe fn fxsave(addr: i32) {
    return_on_pagefault!(writable_or_pagefault(addr as i32, 512));
-    safe_write16(addr.wrapping_add(0) as i32, *fpu_control_word).unwrap();
-    safe_write16(addr.wrapping_add(2) as i32, fpu_load_status_word()).unwrap();
-    safe_write8(addr.wrapping_add(4) as i32, !*fpu_stack_empty & 255).unwrap();
+    safe_write16(addr.wrapping_add(0) as i32, (*fpu_control_word).into()).unwrap();
+    safe_write16(addr.wrapping_add(2) as i32, fpu_load_status_word().into()).unwrap();
+    safe_write8(addr.wrapping_add(4) as i32, !*fpu_stack_empty as i32 & 255).unwrap();
    safe_write16(addr.wrapping_add(6) as i32, *fpu_opcode).unwrap();
    safe_write32(addr.wrapping_add(8) as i32, *fpu_ip).unwrap();
    safe_write16(addr.wrapping_add(12) as i32, *fpu_ip_selector).unwrap();
@ -376,13 +378,7 @@ pub unsafe fn fxsave(addr: i32) {

    for i in 0..8 {
        let reg_index = i + *fpu_stack_ptr as i32 & 7;
-        if *fxsave_store_fpu_mask & 1 << reg_index != 0 {
-            fpu_store_m80(addr + 32 + (i << 4), *fpu_st.offset(reg_index as isize));
-        }
-        else {
-            safe_write64(addr + 32 + (i << 4), *reg_mmx.offset(reg_index as isize)).unwrap();
-            safe_write64(addr + 32 + (i << 4) | 8, 0).unwrap();
-        }
+        fpu_store_m80(addr + 32 + (i << 4), *fpu_st.offset(reg_index as isize));
    }

    // If the OSFXSR bit in control register CR4 is not set, the FXSAVE
@ -407,9 +403,9 @@ pub unsafe fn fxrstor(addr: i32) {
        return;
    }
    else {
-        *fpu_control_word = safe_read16(addr.wrapping_add(0) as i32).unwrap();
-        fpu_set_status_word(safe_read16(addr.wrapping_add(2) as i32).unwrap());
-        *fpu_stack_empty = !safe_read8(addr.wrapping_add(4) as i32).unwrap() & 255;
+        set_control_word(safe_read16(addr.wrapping_add(0) as i32).unwrap() as u16);
+        fpu_set_status_word(safe_read16(addr.wrapping_add(2) as i32).unwrap() as u16);
+        *fpu_stack_empty = !safe_read8(addr.wrapping_add(4) as i32).unwrap() as u8;
        *fpu_opcode = safe_read16(addr.wrapping_add(6) as i32).unwrap();
        *fpu_ip = safe_read32s(addr.wrapping_add(8) as i32).unwrap();
        *fpu_ip = safe_read16(addr.wrapping_add(12) as i32).unwrap();
@ -421,14 +417,8 @@ pub unsafe fn fxrstor(addr: i32) {
            let reg_index = *fpu_stack_ptr as i32 + i & 7;
            *fpu_st.offset(reg_index as isize) =
                fpu_load_m80(addr.wrapping_add(32).wrapping_add(i << 4)).unwrap();
-            *reg_mmx.offset(reg_index as isize) =
-                safe_read64s(addr.wrapping_add(32).wrapping_add(i << 4)).unwrap();
        }

-        // Mark values as coming from the fpu: xmm registers fit into x87 registers, but not the
-        // other way around
-        *fxsave_store_fpu_mask = 0xff;
-
        for i in 0..8 {
            (*reg_xmm.offset(i as isize)).u32_0[0] =
                safe_read32s(addr.wrapping_add(160).wrapping_add(i << 4).wrapping_add(0)).unwrap()
--- a/src/rust/jit_instructions.rs
+++ b/src/rust/jit_instructions.rs
@ -2606,12 +2606,12 @@ pub fn instr_D7_jit(ctx: &mut JitContext) {
 fn instr_group_D8_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte, op: &str) {
    ctx.builder.const_i32(0);
    codegen::gen_fpu_load_m32(ctx, modrm_byte);
-    ctx.builder.call_fn2_i32_f64(op)
+    ctx.builder.call_fn3_i32_i64_i32(op)
 }
 fn instr_group_D8_reg_jit(ctx: &mut JitContext, r: u32, op: &str) {
    ctx.builder.const_i32(0);
    codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn2_i32_f64(op)
+    ctx.builder.call_fn3_i32_i64_i32(op)
 }

 pub fn instr_D8_0_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
@ -2628,19 +2628,19 @@ pub fn instr_D8_1_reg_jit(ctx: &mut JitContext, r: u32) {
 }
 pub fn instr_D8_2_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_fpu_load_m32(ctx, modrm_byte);
-    ctx.builder.call_fn1_f64("fpu_fcom")
+    ctx.builder.call_fn2_i64_i32("fpu_fcom")
 }
 pub fn instr_D8_2_reg_jit(ctx: &mut JitContext, r: u32) {
    codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn1_f64("fpu_fcom")
+    ctx.builder.call_fn2_i64_i32("fpu_fcom")
 }
 pub fn instr_D8_3_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_fpu_load_m32(ctx, modrm_byte);
-    ctx.builder.call_fn1_f64("fpu_fcomp")
+    ctx.builder.call_fn2_i64_i32("fpu_fcomp")
 }
 pub fn instr_D8_3_reg_jit(ctx: &mut JitContext, r: u32) {
    codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn1_f64("fpu_fcomp")
+    ctx.builder.call_fn2_i64_i32("fpu_fcomp")
 }
 pub fn instr_D8_4_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    instr_group_D8_mem_jit(ctx, modrm_byte, "fpu_fsub")
@ -2669,11 +2669,11 @@ pub fn instr_D8_7_reg_jit(ctx: &mut JitContext, r: u32) {

 pub fn instr16_D9_0_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_fpu_load_m32(ctx, modrm_byte);
-    ctx.builder.call_fn1_f64("fpu_push");
+    ctx.builder.call_fn2_i64_i32("fpu_push");
 }
 pub fn instr16_D9_0_reg_jit(ctx: &mut JitContext, r: u32) {
    codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn1_f64("fpu_push");
+    ctx.builder.call_fn2_i64_i32("fpu_push");
 }
 pub fn instr32_D9_0_reg_jit(ctx: &mut JitContext, r: u32) { instr16_D9_0_reg_jit(ctx, r) }
 pub fn instr32_D9_0_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
@ -2697,8 +2697,7 @@ pub fn instr16_D9_2_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.demote_f64_to_f32();
-    ctx.builder.reinterpret_f32_as_i32();
+    ctx.builder.call_fn2_i64_i32_ret("f80_to_f32");
    let value_local = ctx.builder.set_new_local();
    codegen::gen_safe_write32(ctx, &address_local, &value_local);
    ctx.builder.free_local(address_local);
@ -2718,13 +2717,12 @@ pub fn instr16_D9_3_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.demote_f64_to_f32();
-    ctx.builder.reinterpret_f32_as_i32();
+    ctx.builder.call_fn2_i64_i32_ret("f80_to_f32");
    let value_local = ctx.builder.set_new_local();
    codegen::gen_safe_write32(ctx, &address_local, &value_local);
-    codegen::gen_fn0_const(ctx.builder, "fpu_pop");
    ctx.builder.free_local(address_local);
    ctx.builder.free_local(value_local);
+    codegen::gen_fn0_const(ctx.builder, "fpu_pop");
 }
 pub fn instr16_D9_3_reg_jit(ctx: &mut JitContext, r: u32) {
    codegen::gen_fn1_const(ctx.builder, "fpu_fstp", r);
@ -2768,10 +2766,8 @@ pub fn instr32_D9_4_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
 }

 pub fn instr16_D9_5_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
-    ctx.builder
-        .const_i32(global_pointers::fpu_control_word as i32);
    codegen::gen_modrm_resolve_safe_read16(ctx, modrm_byte);
-    ctx.builder.store_aligned_u16(0);
+    ctx.builder.call_fn1("set_control_word");
 }
 pub fn instr16_D9_5_reg_jit(ctx: &mut JitContext, r: u32) {
    if r == 7 {
@ -2834,9 +2830,8 @@ pub fn instr32_D9_7_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {

 pub fn instr_DA_5_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    ctx.builder.const_i32(0);
-    codegen::gen_modrm_resolve_safe_read32(ctx, modrm_byte);
-    ctx.builder.convert_i32_to_f64();
-    ctx.builder.call_fn2_i32_f64("fpu_fsubr")
+    codegen::gen_fpu_load_i32(ctx, modrm_byte);
+    ctx.builder.call_fn3_i32_i64_i32("fpu_fsubr")
 }
 pub fn instr_DA_5_reg_jit(ctx: &mut JitContext, r: u32) {
    if r == 1 {
@ -2848,9 +2843,8 @@ pub fn instr_DA_5_reg_jit(ctx: &mut JitContext, r: u32) {
 }

 pub fn instr_DB_0_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
-    codegen::gen_modrm_resolve_safe_read32(ctx, modrm_byte);
-    ctx.builder.convert_i32_to_f64();
-    ctx.builder.call_fn1_f64("fpu_push");
+    // Load the memory operand through gen_fpu_load_i32 and pass the result to fpu_push,
+    // which is called with the (i64, i32) signature.
+    // NOTE(review): presumably (mantissa, sign/exponent) of the converted value -- confirm.
+    codegen::gen_fpu_load_i32(ctx, modrm_byte);
+    ctx.builder.call_fn2_i64_i32("fpu_push");
 }
 pub fn instr_DB_0_reg_jit(ctx: &mut JitContext, r: u32) {
    codegen::gen_fn1_const(ctx.builder, "instr_DB_0_reg", r);
@ -2860,7 +2854,7 @@ pub fn instr_DB_2_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.call_fn1_f64_ret("fpu_convert_to_i32");
+    ctx.builder.call_fn2_i64_i32_ret("fpu_convert_to_i32");
    let value_local = ctx.builder.set_new_local();
    codegen::gen_safe_write32(ctx, &address_local, &value_local);
    ctx.builder.free_local(address_local);
@ -2873,7 +2867,7 @@ pub fn instr_DB_3_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.call_fn1_f64_ret("fpu_convert_to_i32");
+    ctx.builder.call_fn2_i64_i32_ret("fpu_convert_to_i32");
    let value_local = ctx.builder.set_new_local();
    codegen::gen_safe_write32(ctx, &address_local, &value_local);
    ctx.builder.free_local(address_local);
@ -2919,12 +2913,12 @@ pub fn instr_DB_6_reg_jit(ctx: &mut JitContext, r: u32) {
 fn instr_group_DC_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte, op: &str) {
+    // Shared emitter for the DC memory forms: pushes the constant 0, then the operand
+    // produced by gen_fpu_load_m64, and calls `op` with the (i32, i64, i32) signature.
+    // NOTE(review): the leading i32 looks like the target ST index (the reg variant passes
+    // `r` in the same position) -- confirm against the fpu_* callee.
     ctx.builder.const_i32(0);
     codegen::gen_fpu_load_m64(ctx, modrm_byte);
-    ctx.builder.call_fn2_i32_f64(op)
+    ctx.builder.call_fn3_i32_i64_i32(op)
 }
 fn instr_group_DC_reg_jit(ctx: &mut JitContext, r: u32, op: &str) {
+    // Shared emitter for the DC register forms: pushes `r` as an i32 constant, then the
+    // value produced by gen_fpu_get_sti(ctx, r), and calls `op` with the (i32, i64, i32)
+    // signature.
     ctx.builder.const_i32(r as i32);
     codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn2_i32_f64(op)
+    ctx.builder.call_fn3_i32_i64_i32(op)
 }

 pub fn instr_DC_0_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
@ -2941,19 +2935,19 @@ pub fn instr_DC_1_reg_jit(ctx: &mut JitContext, r: u32) {
 }
 pub fn instr_DC_2_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    // Load the m64 operand and pass it to fpu_fcom, called with the (i64, i32) signature.
     codegen::gen_fpu_load_m64(ctx, modrm_byte);
-    ctx.builder.call_fn1_f64("fpu_fcom")
+    ctx.builder.call_fn2_i64_i32("fpu_fcom")
 }
 pub fn instr_DC_2_reg_jit(ctx: &mut JitContext, r: u32) {
+    // Fetch ST(r) via gen_fpu_get_sti and pass it to fpu_fcom, called with the (i64, i32)
+    // signature.
     codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn1_f64("fpu_fcom")
+    ctx.builder.call_fn2_i64_i32("fpu_fcom")
 }
 pub fn instr_DC_3_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    // Load the m64 operand and pass it to fpu_fcomp, called with the (i64, i32) signature.
     codegen::gen_fpu_load_m64(ctx, modrm_byte);
-    ctx.builder.call_fn1_f64("fpu_fcomp")
+    ctx.builder.call_fn2_i64_i32("fpu_fcomp")
 }
 pub fn instr_DC_3_reg_jit(ctx: &mut JitContext, r: u32) {
+    // Fetch ST(r) via gen_fpu_get_sti and pass it to fpu_fcomp, called with the (i64, i32)
+    // signature.
     codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn1_f64("fpu_fcomp")
+    ctx.builder.call_fn2_i64_i32("fpu_fcomp")
 }
 pub fn instr_DC_4_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    instr_group_DC_mem_jit(ctx, modrm_byte, "fpu_fsub")
@ -2982,7 +2976,7 @@ pub fn instr_DC_7_reg_jit(ctx: &mut JitContext, r: u32) {

 pub fn instr16_DD_0_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
+    // Load the m64 operand through gen_fpu_load_m64 and pass it to fpu_push, called with
+    // the (i64, i32) signature.
     codegen::gen_fpu_load_m64(ctx, modrm_byte);
-    ctx.builder.call_fn1_f64("fpu_push");
+    ctx.builder.call_fn2_i64_i32("fpu_push");
 }
 pub fn instr16_DD_0_reg_jit(ctx: &mut JitContext, r: u32) {
    codegen::gen_fn1_const(ctx.builder, "fpu_ffree", r);
@ -2996,7 +2990,7 @@ pub fn instr16_DD_2_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.reinterpret_f64_as_i64();
+    ctx.builder.call_fn2_i64_i32_ret_i64("f80_to_f64");
    let value_local = ctx.builder.set_new_local_i64();
    codegen::gen_safe_write64(ctx, &address_local, &value_local);
    ctx.builder.free_local(address_local);
@ -3014,7 +3008,7 @@ pub fn instr16_DD_3_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.reinterpret_f64_as_i64();
+    ctx.builder.call_fn2_i64_i32_ret_i64("f80_to_f64");
    let value_local = ctx.builder.set_new_local_i64();
    codegen::gen_safe_write64(ctx, &address_local, &value_local);
    codegen::gen_fn0_const(ctx.builder, "fpu_pop");
@ -3044,15 +3038,13 @@ pub fn instr32_DD_5_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {

 fn instr_group_DE_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte, op: &str) {
     ctx.builder.const_i32(0);
-    codegen::gen_modrm_resolve_safe_read16(ctx, modrm_byte);
-    codegen::sign_extend_i16(ctx.builder);
-    ctx.builder.convert_i32_to_f64();
-    ctx.builder.call_fn2_i32_f64(op)
+    // Shared emitter for the DE memory forms: the constant 0 pushed above is followed by the
+    // operand produced by gen_fpu_load_i16, and `op` is called with the (i32, i64, i32)
+    // signature.
+    // NOTE(review): gen_fpu_load_i16 presumably reads a signed 16-bit integer and converts
+    // it to an F80 -- confirm in codegen.
+    codegen::gen_fpu_load_i16(ctx, modrm_byte);
+    ctx.builder.call_fn3_i32_i64_i32(op)
 }
 fn instr_group_DE_reg_jit(ctx: &mut JitContext, r: u32, op: &str) {
+    // Shared emitter for the DE register forms: calls `op` with `r` and the value produced
+    // by gen_fpu_get_sti(ctx, r), using the (i32, i64, i32) signature, then calls fpu_pop.
     ctx.builder.const_i32(r as i32);
     codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn2_i32_f64(op);
+    ctx.builder.call_fn3_i32_i64_i32(op);
     codegen::gen_fn0_const(ctx.builder, "fpu_pop")
 }

@ -3069,26 +3061,22 @@ pub fn instr_DE_1_reg_jit(ctx: &mut JitContext, r: u32) {
    instr_group_DE_reg_jit(ctx, r, "fpu_fmul")
 }
 pub fn instr_DE_2_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
-    codegen::gen_modrm_resolve_safe_read16(ctx, modrm_byte);
-    codegen::sign_extend_i16(ctx.builder);
-    ctx.builder.convert_i32_to_f64();
-    ctx.builder.call_fn1_f64("fpu_fcom")
+    // Load the memory operand through gen_fpu_load_i16 and pass it to fpu_fcom, called with
+    // the (i64, i32) signature.
+    codegen::gen_fpu_load_i16(ctx, modrm_byte);
+    ctx.builder.call_fn2_i64_i32("fpu_fcom")
 }
 pub fn instr_DE_2_reg_jit(ctx: &mut JitContext, r: u32) {
+    // Pass ST(r) to fpu_fcom (called with the (i64, i32) signature), then call fpu_pop.
     codegen::gen_fpu_get_sti(ctx, r);
-    ctx.builder.call_fn1_f64("fpu_fcom");
+    ctx.builder.call_fn2_i64_i32("fpu_fcom");
     codegen::gen_fn0_const(ctx.builder, "fpu_pop")
 }
 pub fn instr_DE_3_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
-    codegen::gen_modrm_resolve_safe_read16(ctx, modrm_byte);
-    codegen::sign_extend_i16(ctx.builder);
-    ctx.builder.convert_i32_to_f64();
-    ctx.builder.call_fn1_f64("fpu_fcomp")
+    // Load the memory operand through gen_fpu_load_i16 and pass it to fpu_fcomp, called
+    // with the (i64, i32) signature.
+    codegen::gen_fpu_load_i16(ctx, modrm_byte);
+    ctx.builder.call_fn2_i64_i32("fpu_fcomp")
 }
 pub fn instr_DE_3_reg_jit(ctx: &mut JitContext, r: u32) {
    if r == 1 {
        codegen::gen_fpu_get_sti(ctx, r);
-        ctx.builder.call_fn1_f64("fpu_fcomp");
+        ctx.builder.call_fn2_i64_i32("fpu_fcomp");
        codegen::gen_fn0_const(ctx.builder, "fpu_pop")
    }
    else {
@ -3124,7 +3112,7 @@ pub fn instr_DF_2_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.call_fn1_f64_ret("fpu_convert_to_i16");
+    ctx.builder.call_fn2_i64_i32_ret("fpu_convert_to_i16");
    let value_local = ctx.builder.set_new_local();
    codegen::gen_safe_write16(ctx, &address_local, &value_local);
    ctx.builder.free_local(address_local);
@ -3137,7 +3125,7 @@ pub fn instr_DF_3_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.call_fn1_f64_ret("fpu_convert_to_i16");
+    ctx.builder.call_fn2_i64_i32_ret("fpu_convert_to_i16");
    let value_local = ctx.builder.set_new_local();
    codegen::gen_safe_write16(ctx, &address_local, &value_local);
    ctx.builder.free_local(address_local);
@ -3164,9 +3152,8 @@ pub fn instr_DF_4_reg_jit(ctx: &mut JitContext, r: u32) {
 }

 pub fn instr_DF_5_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
-    codegen::gen_modrm_resolve_safe_read64(ctx, modrm_byte);
-    ctx.builder.convert_i64_to_f64();
-    ctx.builder.call_fn1_f64("fpu_push");
+    // Load the memory operand through gen_fpu_load_i64 and pass the result to fpu_push,
+    // called with the (i64, i32) signature.
+    // NOTE(review): presumably a 64-bit integer converted to F80 without going through f64
+    // -- confirm in codegen.
+    codegen::gen_fpu_load_i64(ctx, modrm_byte);
+    ctx.builder.call_fn2_i64_i32("fpu_push");
 }
 pub fn instr_DF_5_reg_jit(ctx: &mut JitContext, r: u32) {
    codegen::gen_fn1_const(ctx.builder, "fpu_fucomip", r);
@ -3177,7 +3164,7 @@ pub fn instr_DF_7_mem_jit(ctx: &mut JitContext, modrm_byte: ModrmByte) {
    codegen::gen_modrm_resolve(ctx, modrm_byte);
    let address_local = ctx.builder.set_new_local();
    codegen::gen_fpu_get_sti(ctx, 0);
-    ctx.builder.call_fn1_f64_ret_i64("fpu_convert_to_i64");
+    ctx.builder.call_fn2_i64_i32_ret_i64("fpu_convert_to_i64");
    let value_local = ctx.builder.set_new_local_i64();
    codegen::gen_safe_write64(ctx, &address_local, &value_local);
    ctx.builder.free_local(address_local);
--- a/src/rust/lib.rs
+++ b/src/rust/lib.rs
@ -30,6 +30,7 @@ mod opstats;
 mod page;
 mod prefix;
 mod regs;
+mod softfloat;
 mod state_flags;
 mod util;
 mod wasmgen;
--- a/src/rust/softfloat.rs
+++ b/src/rust/softfloat.rs
@ -0,0 +1,305 @@
+// Bindings to the softfloat C library (compiled to build/softfloat.o and linked into the
+// wasm module). All extF80M_* routines take their 80-bit operands, and write their 80-bit
+// results, through pointers to `F80`.
+extern "C" {
+    // Arithmetic: result is written to `ptr`.
+    fn extF80M_add(x: *const F80, y: *const F80, ptr: *mut F80);
+    fn extF80M_sub(x: *const F80, y: *const F80, ptr: *mut F80);
+    fn extF80M_mul(x: *const F80, y: *const F80, ptr: *mut F80);
+    fn extF80M_div(x: *const F80, y: *const F80, ptr: *mut F80);
+    //fn extF80M_rem(x: *const F80, y: *const F80, ptr: *mut F80);
+    fn extF80M_sqrt(x: *const F80, ptr: *mut F80);
+
+    // `rounding_mode` uses the same numeric encoding that F80::set_rounding_mode stores
+    // into softfloat_roundingMode (0 = nearest-even, 1 = truncate, 2 = floor, 3 = ceil).
+    fn extF80M_roundToInt(x: *const F80, rounding_mode: u8, raise_inexact: bool, dst: *mut F80);
+
+    // Comparisons. NOTE(review): per the softfloat documentation the `_quiet` variants do
+    // not raise the invalid flag for quiet NaNs while the plain lt does -- confirm.
+    fn extF80M_eq(x: *const F80, y: *const F80) -> bool;
+    //fn extF80M_eq_signaling(x: *const F80, y: *const F80) -> bool;
+
+    //fn extF80M_le(x: *const F80, y: *const F80) -> bool;
+    //fn extF80M_le_quiet(x: *const F80, y: *const F80) -> bool;
+    fn extF80M_lt(x: *const F80, y: *const F80) -> bool;
+    fn extF80M_lt_quiet(x: *const F80, y: *const F80) -> bool;
+
+    // Integer conversions.
+    fn extF80M_to_i32(src: *const F80, rounding_mode: u8, raise_inexact: bool) -> i32;
+    fn extF80M_to_i64(src: *const F80, rounding_mode: u8, raise_inexact: bool) -> i64;
+    fn i32_to_extF80M(src: i32, dst: *mut F80);
+    fn i64_to_extF80M(src: i64, dst: *mut F80);
+
+    // Conversions from/to narrower floats; the narrower value travels as an integer.
+    // NOTE(review): presumably the raw IEEE bit pattern (F80::of_f64x transmutes an f64
+    // into the u64 argument) -- confirm for the f32 case.
+    fn f32_to_extF80M(src: i32, dst: *mut F80);
+    fn f64_to_extF80M(src: u64, dst: *mut F80);
+    fn extF80M_to_f32(src: *const F80) -> i32;
+    fn extF80M_to_f64(src: *const F80) -> u64;
+
+    // Global softfloat state, written by F80::set_rounding_mode, F80::set_precision and
+    // F80::clear_exception_flags, and read by F80::get_exception_flags.
+    static mut softfloat_roundingMode: u8;
+    static mut extF80_roundingPrecision: u8;
+    static mut softfloat_exceptionFlags: u8;
+}
+
+/// Rounding direction selectable through `F80::set_rounding_mode`, which maps the
+/// variants to the values 0, 1, 2 and 3 (in declaration order) of softfloat's global
+/// rounding mode.
+pub enum RoundingMode {
+    NearEven,
+    Trunc,
+    Floor,
+    Ceil,
+}
+/// Rounding precision selectable through `F80::set_precision`, which stores 80, 64 or 32
+/// into softfloat's `extF80_roundingPrecision`.
+pub enum Precision {
+    P80,
+    P64,
+    P32,
+}
+
+/// An 80-bit extended-precision float, laid out with `repr(C)` so that pointers to it can
+/// be passed to the softfloat extF80M_* routines.
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct F80 {
+    // 64-bit significand; the top bit is the explicit integer bit (set in F80::ONE)
+    pub mantissa: u64,
+    // bit 15: sign, bits 0..=14: exponent biased by 0x3FFF (see sign() and exponent())
+    pub sign_exponent: u16,
+}
+impl F80 {
+    // Constants.
+    // NOTE(review): at least the low 11 mantissa bits of LN_10, LN_2, PI and LOG2_E are
+    // zero, i.e. they carry f64 precision rather than full 64-bit precision -- confirm
+    // this is acceptable for the instructions that load them.
+    pub const ZERO: F80 = F80 {
+        mantissa: 0,
+        sign_exponent: 0,
+    };
+    pub const ONE: F80 = F80 {
+        mantissa: 0x8000000000000000,
+        sign_exponent: 0x3FFF,
+    };
+    pub const LN_10: F80 = F80 {
+        mantissa: 0x935D8DDDAAA8B000,
+        sign_exponent: 0x4000,
+    };
+    pub const LN_2: F80 = F80 {
+        mantissa: 0xB17217F7D1CF7800,
+        sign_exponent: 0x3FFE,
+    };
+    pub const PI: F80 = F80 {
+        mantissa: 0xC90FDAA22168C000,
+        sign_exponent: 0x4000,
+    };
+    pub const LOG2_E: F80 = F80 {
+        mantissa: 0xB8AA3B295C17F000,
+        sign_exponent: 0x3FFF,
+    };
+    pub const INDEFINITE_NAN: F80 = F80 {
+        mantissa: 0xC000000000000000,
+        sign_exponent: 0x7FFF,
+    };
+    pub const POS_INFINITY: F80 = F80 {
+        mantissa: 0x8000000000000000,
+        sign_exponent: 0x7FFF,
+    };
+    pub const NEG_INFINITY: F80 = F80 {
+        mantissa: 0x8000000000000000,
+        sign_exponent: 0xFFFF,
+    };
+
+    /// Returns true when the sign bit (bit 15 of `sign_exponent`) is set.
+    pub fn sign(&self) -> bool { (self.sign_exponent >> 15) == 1 }
+    /// Returns the exponent with the bias of 0x3FFF removed.
+    pub fn exponent(&self) -> i16 { (self.sign_exponent as i16 & 0x7FFF) - 0x3FFF }
+
+    /// Converts a signed 32-bit integer.
+    pub fn of_i32(src: i32) -> F80 {
+        let mut x = F80::ZERO;
+        unsafe {
+            i32_to_extF80M(src, &mut x)
+        };
+        x
+    }
+    /// Converts a signed 64-bit integer.
+    pub fn of_i64(src: i64) -> F80 {
+        let mut x = F80::ZERO;
+        unsafe {
+            i64_to_extF80M(src, &mut x)
+        };
+        x
+    }
+
+    /// Converts a 32-bit float that is passed as an integer.
+    // NOTE(review): presumably the raw IEEE-754 bit pattern -- confirm against callers.
+    pub fn of_f32(src: i32) -> F80 {
+        let mut x = F80::ZERO;
+        unsafe {
+            f32_to_extF80M(src, &mut x)
+        };
+        x
+    }
+
+    /// Converts a 64-bit float that is passed as its bit pattern.
+    pub fn of_f64(src: u64) -> F80 {
+        let mut x = F80::ZERO;
+        unsafe {
+            f64_to_extF80M(src, &mut x)
+        };
+        x
+    }
+    // Same as of_f64, but takes a native f64 (used by the f64-based fallbacks below).
+    fn of_f64x(src: f64) -> F80 { F80::of_f64(unsafe { std::mem::transmute(src) }) }
+
+    /// Converts to a 32-bit float, returned as an integer (see `of_f32`).
+    pub fn to_f32(&self) -> i32 { unsafe { extF80M_to_f32(self) } }
+    /// Converts to a 64-bit float, returned as its bit pattern.
+    pub fn to_f64(&self) -> u64 { unsafe { extF80M_to_f64(self) } }
+    // Same as to_f64, but returns a native f64 (used by the f64-based fallbacks below).
+    fn to_f64x(&self) -> f64 { unsafe { std::mem::transmute(extF80M_to_f64(self)) } }
+
+    /// Converts to i32 using the currently selected softfloat rounding mode.
+    pub fn to_i32(&self) -> i32 { unsafe { extF80M_to_i32(self, softfloat_roundingMode, false) } }
+    /// Converts to i64 using the currently selected softfloat rounding mode.
+    pub fn to_i64(&self) -> i64 { unsafe { extF80M_to_i64(self, softfloat_roundingMode, false) } }
+
+    // The following functions have no softfloat implementation: they convert to f64,
+    // use the standard library and convert back, so they only have f64 precision.
+    pub fn cos(self) -> F80 { F80::of_f64x(self.to_f64x().cos()) }
+    pub fn sin(self) -> F80 { F80::of_f64x(self.to_f64x().sin()) }
+    pub fn tan(self) -> F80 { F80::of_f64x(self.to_f64x().tan()) }
+    pub fn atan(self) -> F80 { F80::of_f64x(self.to_f64x().atan()) }
+    pub fn atan2(self, other: F80) -> F80 { F80::of_f64x(self.to_f64x().atan2(other.to_f64x())) }
+
+    pub fn log2(self) -> F80 { F80::of_f64x(self.to_f64x().log2()) }
+    pub fn ln(self) -> F80 { F80::of_f64x(self.to_f64x().ln()) }
+
+    /// Returns the value with the sign bit cleared.
+    pub fn abs(self) -> F80 {
+        F80 {
+            mantissa: self.mantissa,
+            sign_exponent: self.sign_exponent & !0x8000,
+        }
+    }
+    /// Returns 2 raised to the power of `self` (computed in f64 precision).
+    pub fn two_pow(self) -> F80 { F80::of_f64x(2.0f64.powf(self.to_f64x())) }
+    /// Rounds to an integral value using the currently selected softfloat rounding mode.
+    pub fn round(self) -> F80 {
+        let mut result = F80::ZERO;
+        unsafe {
+            extF80M_roundToInt(&self, softfloat_roundingMode, false, &mut result)
+        };
+        result
+    }
+    /// Rounds to an integral value toward zero, regardless of the selected rounding mode.
+    pub fn trunc(self) -> F80 {
+        let mut result = F80::ZERO;
+        unsafe {
+            // 1 is the encoding set_rounding_mode uses for RoundingMode::Trunc
+            extF80M_roundToInt(&self, 1, false, &mut result)
+        };
+        result
+    }
+
+    /// Returns the square root.
+    pub fn sqrt(self) -> F80 {
+        let mut result = F80::ZERO;
+        unsafe {
+            extF80M_sqrt(&self, &mut result)
+        };
+        result
+    }
+
+    /// Returns true if the value is neither an infinity nor a NaN, matching the contract
+    /// of `f64::is_finite`.
+    pub fn is_finite(self) -> bool {
+        // Infinities and NaNs are exactly the encodings whose 15-bit exponent field is all
+        // ones. Comparing against +-infinity alone (as was done previously) classified NaN
+        // as finite, and went through softfloat comparisons for a pure classification.
+        (self.sign_exponent & 0x7FFF) != 0x7FFF
+    }
+    /// Returns true if the value is a NaN.
+    pub fn is_nan(self) -> bool {
+        // TODO: Can probably be done more efficiently
+        // NOTE(review): this goes through extF80M_eq, which may touch the softfloat
+        // exception flags for signaling NaNs -- confirm whether callers depend on that.
+        self != self
+    }
+
+    /// Selects the rounding mode used by subsequent softfloat operations.
+    pub fn set_rounding_mode(mode: RoundingMode) {
+        unsafe {
+            softfloat_roundingMode = match mode {
+                RoundingMode::NearEven => 0,
+                RoundingMode::Trunc => 1,
+                RoundingMode::Floor => 2,
+                RoundingMode::Ceil => 3,
+            }
+        };
+    }
+    /// Selects the precision that softfloat rounds extF80 results to.
+    pub fn set_precision(precision: Precision) {
+        unsafe {
+            extF80_roundingPrecision = match precision {
+                Precision::P80 => 80,
+                Precision::P64 => 64,
+                Precision::P32 => 32,
+            }
+        };
+    }
+
+    /// Returns the accumulated softfloat exception flags, translated to the bit layout of
+    /// the x87 status word.
+    pub fn get_exception_flags() -> u8 {
+        let f = unsafe { softfloat_exceptionFlags };
+        // translate softfloat's flags to x87 status flags:
+        //   softfloat bit 4 -> status bit 0
+        //   softfloat bit 3 -> status bit 2
+        //   softfloat bit 1 -> status bit 4
+        // softfloat bits 0 and 2 are not forwarded.
+        // NOTE(review): per the softfloat headers these are invalid, infinite
+        // (divide-by-zero) and underflow, with inexact and overflow being dropped --
+        // confirm that dropping them is intentional.
+        (f >> 4 & 1) | (f >> 1 & 4) | (f << 3 & 16)
+    }
+    /// Clears the accumulated softfloat exception flags.
+    pub fn clear_exception_flags() { unsafe { softfloat_exceptionFlags = 0 } }
+
+    /// Three-way comparison built on `extF80M_lt_quiet`; returns `None` when the operands
+    /// are neither less, greater nor equal (unordered).
+    pub fn partial_cmp_quiet(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        // TODO: Can probably be done more efficiently
+        if unsafe { extF80M_lt_quiet(self, other) } {
+            Some(std::cmp::Ordering::Less)
+        }
+        else if unsafe { extF80M_lt_quiet(other, self) } {
+            Some(std::cmp::Ordering::Greater)
+        }
+        else if self == other {
+            Some(std::cmp::Ordering::Equal)
+        }
+        else {
+            None
+        }
+    }
+}
+
+impl std::ops::Add for F80 {
+    type Output = F80;
+    /// `self + rhs`, computed by softfloat's `extF80M_add`.
+    fn add(self, rhs: Self) -> Self {
+        // softfloat writes the sum through the output pointer
+        let mut sum = F80::ZERO;
+        unsafe { extF80M_add(&self, &rhs, &mut sum) };
+        sum
+    }
+}
+impl std::ops::Sub for F80 {
+    type Output = F80;
+    /// `self - rhs`, computed by softfloat's `extF80M_sub`.
+    fn sub(self, rhs: Self) -> Self {
+        // softfloat writes the difference through the output pointer
+        let mut difference = F80::ZERO;
+        unsafe { extF80M_sub(&self, &rhs, &mut difference) };
+        difference
+    }
+}
+impl std::ops::Neg for F80 {
+    type Output = F80;
+    /// Negates by flipping the sign bit; mantissa and exponent are left untouched.
+    fn neg(self) -> Self {
+        F80 {
+            mantissa: self.mantissa,
+            sign_exponent: self.sign_exponent ^ 0x8000,
+        }
+    }
+}
+impl std::ops::Mul for F80 {
+    type Output = F80;
+    /// `self * rhs`, computed by softfloat's `extF80M_mul`.
+    fn mul(self, rhs: Self) -> Self {
+        // softfloat writes the product through the output pointer
+        let mut product = F80::ZERO;
+        unsafe { extF80M_mul(&self, &rhs, &mut product) };
+        product
+    }
+}
+impl std::ops::Div for F80 {
+    type Output = F80;
+    /// `self / rhs`, computed by softfloat's `extF80M_div`.
+    fn div(self, rhs: Self) -> Self {
+        // softfloat writes the quotient through the output pointer
+        let mut quotient = F80::ZERO;
+        unsafe { extF80M_div(&self, &rhs, &mut quotient) };
+        quotient
+    }
+}
+impl std::ops::Rem for F80 {
+    type Output = F80;
+    /// Remainder with the quotient truncated toward zero:
+    /// `self - trunc(self / rhs) * rhs`.
+    ///
+    /// softfloat's `extF80M_rem` is not used here because it uses round-to-nearest
+    /// instead of truncation.
+    fn rem(self, rhs: Self) -> Self {
+        let ratio = self / rhs;
+        let whole_part = ratio.trunc();
+        let covered = whole_part * rhs;
+        self - covered
+    }
+}
+
+impl PartialEq for F80 {
+    /// Equality as decided by softfloat's `extF80M_eq`, not by comparing the raw fields.
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe {
+            extF80M_eq(self, rhs)
+        }
+    }
+}
+impl PartialOrd for F80 {
+    /// Three-way comparison built on `extF80M_lt` and `==`; returns `None` when the
+    /// operands are neither less, greater nor equal (unordered).
+    fn partial_cmp(&self, rhs: &Self) -> Option<std::cmp::Ordering> {
+        use std::cmp::Ordering;
+
+        // TODO: Can probably be done more efficiently
+        if unsafe { extF80M_lt(self, rhs) } {
+            return Some(Ordering::Less);
+        }
+        if unsafe { extF80M_lt(rhs, self) } {
+            return Some(Ordering::Greater);
+        }
+        if self == rhs {
+            return Some(Ordering::Equal);
+        }
+        None
+    }
+}
--- a/src/rust/wasmgen/wasm_builder.rs
+++ b/src/rust/wasmgen/wasm_builder.rs
@ -18,18 +18,17 @@ enum FunctionType {
    FN1_RET_TYPE_INDEX,
    FN2_RET_TYPE_INDEX,

-    FN1_RET_F64_TYPE_INDEX,
    FN1_RET_I64_TYPE_INDEX,
-    FN2_I32_F64_TYPE_INDEX,
+    FN2_I32_I64_TYPE_INDEX,
    FN2_I64_I32_TYPE_INDEX,
-    FN1_F64_TYPE_INDEX,
-    FN1_F64_RET_I32_TYPE_INDEX,
-    FN1_F64_RET_I64_TYPE_INDEX,
+    FN2_I64_I32_RET_TYPE_INDEX,
+    FN2_I64_I32_RET_I64_TYPE_INDEX,

    FN3_RET_TYPE_INDEX,
    FN4_RET_TYPE_INDEX,

    FN3_I64_I32_I32_TYPE_INDEX,
+    FN3_I32_I64_I32_TYPE_INDEX,
    FN3_I32_I64_I32_RET_TYPE_INDEX,
    FN4_I32_I64_I64_I32_RET_TYPE_INDEX,
    // When adding at the end, update LAST below
@ -269,13 +268,6 @@ impl WasmBuilder {
                    self.output.push(1);
                    self.output.push(op::TYPE_I32);
                },
-                FunctionType::FN1_RET_F64_TYPE_INDEX => {
-                    self.output.push(op::TYPE_FUNC);
-                    self.output.push(1);
-                    self.output.push(op::TYPE_I32);
-                    self.output.push(1);
-                    self.output.push(op::TYPE_F64);
-                },
                FunctionType::FN1_RET_I64_TYPE_INDEX => {
                    self.output.push(op::TYPE_FUNC);
                    self.output.push(1);
@ -283,11 +275,11 @@ impl WasmBuilder {
                    self.output.push(1);
                    self.output.push(op::TYPE_I64);
                },
-                FunctionType::FN2_I32_F64_TYPE_INDEX => {
+                FunctionType::FN2_I32_I64_TYPE_INDEX => {
                    self.output.push(op::TYPE_FUNC);
                    self.output.push(2);
                    self.output.push(op::TYPE_I32);
-                    self.output.push(op::TYPE_F64);
+                    self.output.push(op::TYPE_I64);
                    self.output.push(0);
                },
                FunctionType::FN2_I64_I32_TYPE_INDEX => {
@ -297,23 +289,19 @@ impl WasmBuilder {
                    self.output.push(op::TYPE_I32);
                    self.output.push(0);
                },
-                FunctionType::FN1_F64_TYPE_INDEX => {
+                FunctionType::FN2_I64_I32_RET_TYPE_INDEX => {
                    self.output.push(op::TYPE_FUNC);
-                    self.output.push(1);
-                    self.output.push(op::TYPE_F64);
-                    self.output.push(0);
-                },
-                FunctionType::FN1_F64_RET_I32_TYPE_INDEX => {
-                    self.output.push(op::TYPE_FUNC);
-                    self.output.push(1);
-                    self.output.push(op::TYPE_F64);
+                    self.output.push(2);
+                    self.output.push(op::TYPE_I64);
+                    self.output.push(op::TYPE_I32);
                    self.output.push(1);
                    self.output.push(op::TYPE_I32);
                },
-                FunctionType::FN1_F64_RET_I64_TYPE_INDEX => {
+                FunctionType::FN2_I64_I32_RET_I64_TYPE_INDEX => {
                    self.output.push(op::TYPE_FUNC);
-                    self.output.push(1);
-                    self.output.push(op::TYPE_F64);
+                    self.output.push(2);
+                    self.output.push(op::TYPE_I64);
+                    self.output.push(op::TYPE_I32);
                    self.output.push(1);
                    self.output.push(op::TYPE_I64);
                },
@ -344,6 +332,14 @@ impl WasmBuilder {
                    self.output.push(op::TYPE_I32);
                    self.output.push(0);
                },
+                FunctionType::FN3_I32_I64_I32_TYPE_INDEX => {
+                    self.output.push(op::TYPE_FUNC);
+                    self.output.push(3);
+                    self.output.push(op::TYPE_I32);
+                    self.output.push(op::TYPE_I64);
+                    self.output.push(op::TYPE_I32);
+                    self.output.push(0);
+                },
                FunctionType::FN3_I32_I64_I32_RET_TYPE_INDEX => {
                    self.output.push(op::TYPE_FUNC);
                    self.output.push(3);
@ -608,6 +604,13 @@ impl WasmBuilder {
        self.const_i32(addr as i32);
        self.load_aligned_i32(0);
    }
+    /// Emits an aligned i64 load from the constant address `addr`: pushes `addr` as an
+    /// i32 constant and loads from it with offset 0.
+    pub fn load_fixed_i64(&mut self, addr: u32) {
+        // a misaligned address doesn't cause a failure in the generated code, but it will
+        // be much slower
+        dbg_assert!((addr & 7) == 0);
+
+        self.const_i32(addr as i32);
+        self.load_aligned_i64(0);
+    }

    pub fn load_u8(&mut self, byte_offset: u32) {
        self.instruction_body.push(op::OP_I32LOAD8U);
@ -657,11 +660,11 @@ impl WasmBuilder {
        write_leb_u32(&mut self.instruction_body, byte_offset);
    }

-    pub fn store_aligned_u16(&mut self, byte_offset: u32) {
-        self.instruction_body.push(op::OP_I32STORE16);
-        self.instruction_body.push(op::MEM_ALIGN16);
-        write_leb_u32(&mut self.instruction_body, byte_offset);
-    }
+    //pub fn store_aligned_u16(&mut self, byte_offset: u32) {
+    //    self.instruction_body.push(op::OP_I32STORE16);
+    //    self.instruction_body.push(op::MEM_ALIGN16);
+    //    write_leb_u32(&mut self.instruction_body, byte_offset);
+    //}

    pub fn store_aligned_i32(&mut self, byte_offset: u32) {
        self.instruction_body.push(op::OP_I32STORE);
@ -733,22 +736,22 @@ impl WasmBuilder {

    pub fn ltu_i32(&mut self) { self.instruction_body.push(op::OP_I32LTU); }

-    pub fn reinterpret_i32_as_f32(&mut self) {
-        self.instruction_body.push(op::OP_F32REINTERPRETI32);
-    }
-    pub fn reinterpret_f32_as_i32(&mut self) {
-        self.instruction_body.push(op::OP_I32REINTERPRETF32);
-    }
-    pub fn reinterpret_i64_as_f64(&mut self) {
-        self.instruction_body.push(op::OP_F64REINTERPRETI64);
-    }
-    pub fn reinterpret_f64_as_i64(&mut self) {
-        self.instruction_body.push(op::OP_I64REINTERPRETF64);
-    }
-    pub fn promote_f32_to_f64(&mut self) { self.instruction_body.push(op::OP_F64PROMOTEF32); }
-    pub fn demote_f64_to_f32(&mut self) { self.instruction_body.push(op::OP_F32DEMOTEF64); }
-    pub fn convert_i32_to_f64(&mut self) { self.instruction_body.push(op::OP_F64CONVERTSI32); }
-    pub fn convert_i64_to_f64(&mut self) { self.instruction_body.push(op::OP_F64CONVERTSI64); }
+    //pub fn reinterpret_i32_as_f32(&mut self) {
+    //    self.instruction_body.push(op::OP_F32REINTERPRETI32);
+    //}
+    //pub fn reinterpret_f32_as_i32(&mut self) {
+    //    self.instruction_body.push(op::OP_I32REINTERPRETF32);
+    //}
+    //pub fn reinterpret_i64_as_f64(&mut self) {
+    //    self.instruction_body.push(op::OP_F64REINTERPRETI64);
+    //}
+    //pub fn reinterpret_f64_as_i64(&mut self) {
+    //    self.instruction_body.push(op::OP_I64REINTERPRETF64);
+    //}
+    //pub fn promote_f32_to_f64(&mut self) { self.instruction_body.push(op::OP_F64PROMOTEF32); }
+    //pub fn demote_f64_to_f32(&mut self) { self.instruction_body.push(op::OP_F32DEMOTEF64); }
+    //pub fn convert_i32_to_f64(&mut self) { self.instruction_body.push(op::OP_F64CONVERTSI32); }
+    //pub fn convert_i64_to_f64(&mut self) { self.instruction_body.push(op::OP_F64CONVERTSI64); }
    pub fn extend_unsigned_i32_to_i64(&mut self) {
        self.instruction_body.push(op::OP_I64EXTENDUI32);
    }
@ -837,25 +840,19 @@ impl WasmBuilder {
    pub fn call_fn1_ret_i64(&mut self, name: &str) {
        self.call_fn(name, FunctionType::FN1_RET_I64_TYPE_INDEX)
    }
-    pub fn call_fn1_ret_f64(&mut self, name: &str) {
-        self.call_fn(name, FunctionType::FN1_RET_F64_TYPE_INDEX)
-    }
-    pub fn call_fn1_f64_ret(&mut self, name: &str) {
-        self.call_fn(name, FunctionType::FN1_F64_RET_I32_TYPE_INDEX)
-    }
-    pub fn call_fn1_f64_ret_i64(&mut self, name: &str) {
-        self.call_fn(name, FunctionType::FN1_F64_RET_I64_TYPE_INDEX)
-    }
-    pub fn call_fn1_f64(&mut self, name: &str) {
-        self.call_fn(name, FunctionType::FN1_F64_TYPE_INDEX)
-    }
    pub fn call_fn2(&mut self, name: &str) { self.call_fn(name, FunctionType::FN2_TYPE_INDEX) }
-    pub fn call_fn2_i32_f64(&mut self, name: &str) {
-        self.call_fn(name, FunctionType::FN2_I32_F64_TYPE_INDEX)
+    /// Emits a call to the function `name` with the signature (i32, i64) -> ().
+    pub fn call_fn2_i32_i64(&mut self, name: &str) {
+        self.call_fn(name, FunctionType::FN2_I32_I64_TYPE_INDEX)
     }
    pub fn call_fn2_i64_i32(&mut self, name: &str) {
        self.call_fn(name, FunctionType::FN2_I64_I32_TYPE_INDEX)
    }
+    /// Emits a call to the function `name` with the signature (i64, i32) -> i32.
+    pub fn call_fn2_i64_i32_ret(&mut self, name: &str) {
+        self.call_fn(name, FunctionType::FN2_I64_I32_RET_TYPE_INDEX)
+    }
+    /// Emits a call to the function `name` with the signature (i64, i32) -> i64.
+    pub fn call_fn2_i64_i32_ret_i64(&mut self, name: &str) {
+        self.call_fn(name, FunctionType::FN2_I64_I32_RET_I64_TYPE_INDEX)
+    }
    pub fn call_fn2_ret(&mut self, name: &str) {
        self.call_fn(name, FunctionType::FN2_RET_TYPE_INDEX)
    }
@ -866,6 +863,9 @@ impl WasmBuilder {
    pub fn call_fn3_i64_i32_i32(&mut self, name: &str) {
        self.call_fn(name, FunctionType::FN3_I64_I32_I32_TYPE_INDEX)
    }
+    /// Emits a call to the function `name` with the signature (i32, i64, i32) -> ().
+    pub fn call_fn3_i32_i64_i32(&mut self, name: &str) {
+        self.call_fn(name, FunctionType::FN3_I32_I64_I32_TYPE_INDEX)
+    }
    pub fn call_fn3_i32_i64_i32_ret(&mut self, name: &str) {
        self.call_fn(name, FunctionType::FN3_I32_I64_I32_RET_TYPE_INDEX)
    }
--- a/tests/nasm/run.js
+++ b/tests/nasm/run.js
@ -366,8 +366,8 @@ else {
        emulator.stop();
        var cpu = emulator.v86.cpu;

-        const evaluated_fpu_regs = new Float64Array(8).map((_, i) => cpu.fpu_st[i + cpu.fpu_stack_ptr[0] & 7]);
-        const evaluated_mmxs = cpu.reg_mmxs;
+        const evaluated_fpu_regs = new Float64Array(8).map((_, i) => cpu.fpu_get_sti_f64(i));
+        const evaluated_mmxs = new Int32Array(16).map((_, i) => cpu.fpu_st[(i & ~1) << 1 | (i & 1)]);
        const evaluated_xmms = cpu.reg_xmm32s;
        const evaluated_memory = new Int32Array(cpu.mem8.slice(0x120000 - 16 * 4, 0x120000).buffer);
        const evaluated_fpu_tag = cpu.fpu_load_tag_word();
@ -436,7 +436,7 @@ else {
                for (let i = 0; i < evaluated_mmxs.length; i++) {
                    if (evaluated_mmxs[i] !== expected_mmx_registers[i]) {
                        individual_failures.push({
-                            name: "mm" + (i >> 1) + ".int32[" + (i & 1) + "] (cpu.reg_mmx[" + i + "])",
+                            name: "mm" + (i >> 1) + ".int32[" + (i & 1) + "]",
                            expected: expected_mmx_registers[i],
                            actual: evaluated_mmxs[i],
                        });
--- a/tests/qemu/test-i386.c
+++ b/tests/qemu/test-i386.c
@ -882,15 +882,12 @@ void test_fops(double a, double b)
    int ib = (int)b;
    int dest = 0;

-    // XXX: Tests below are disabled since libc (which is statically linked)
-    //      contains sse instructions, some of which aren't supported.
-
    printf("a=%f b=%f a+b=%f\n", a, b, a + b);
    printf("a=%f b=%f a-b=%f\n", a, b, a - b);
    printf("a=%f b=%f a*b=%f\n", a, b, a * b);
    printf("a=%f b=%f a/b=%f\n", a, b, a / b);
    printf("a=%f b=%f =%f\n", a, b, a + a + a + 3 * b / a * (a * a * a / b / b / (a + 1.0) - 3.5 + a * b / (3.7 * a / (a - b * b) + 6.5 * a / (b * b * a / -b - a * b) + 5.5 * (b - a))));
-    printf("a=%f b=%f fmod(a, b)=%f\n", a, b, fmod(a, b));
+    //printf("a=%f b=%f fmod(a, b)=%f\n", a, b, fmod(a, b)); // difference in sign bit on zero and nan
    printf("a=%f fma(a,b,a)=%f\n", a, fma(a, b, a));
    printf("a=%f fdim(a,b)=%f\n", a, fdim(a, b));
    printf("a=%f copysign(a,b)=%f\n", a, copysign(a, b));
@ -898,10 +895,10 @@ void test_fops(double a, double b)
    printf("a=%f sin(a)=%f\n", a, sin(a));
    printf("a=%f cos(a)=%f\n", a, cos(a));
    printf("a=%f tan(a)=%f\n", a, tan(a));
-    //printf("a=%f log(a)=%f\n", a, log(a));
-    //printf("a=%f log10(a)=%f\n", a, log10(a));
-    //printf("a=%f log1p(a)=%f\n", a, log1p(a));
-    //printf("a=%f log2(a)=%f\n", a, log2(a));
+    printf("a=%f log(a)=%f\n", a, log(a));
+    printf("a=%f log10(a)=%f\n", a, log10(a));
+    printf("a=%f log1p(a)=%f\n", a, log1p(a));
+    printf("a=%f log2(a)=%f\n", a, log2(a));
    printf("a=%f logb(a)=%f\n", a, logb(a));
    printf("a=%f ilogb(a)=%d\n", a, ilogb(a));
    printf("a=%f exp(a)=%f\n", a, exp(a));
@ -909,11 +906,11 @@ void test_fops(double a, double b)
    printf("a=%f frexp(a)=%f, %d\n", a, frexp(a, &dest), dest);
    printf("a=%f ldexp(a,b)=%f\n", a, ldexp(a, ib));
    printf("a=%f scalbn(a,b)=%f\n", a, scalbn(a, ib));
-    //printf("a=%f sinh(a)=%f\n", a, sinh(a));
-    //printf("a=%f cosh(a)=%f\n", a, cosh(a));
+    printf("a=%f sinh(a)=%f\n", a, sinh(a));
+    printf("a=%f cosh(a)=%f\n", a, cosh(a));
    printf("a=%f tanh(a)=%f\n", a, tanh(a));
    printf("a=%f fabs(a)=%f\n", a, fabs(a));
-    //printf("a=%f pow(a,b)=%f\n", a, pow(a,b));
+    printf("a=%f pow(a,b)=%f\n", a, pow(a,b));
    printf("a=%f b=%f atan2(a, b)=%f\n", a, b, atan2(a, b));
    /* just to test some op combining */
    printf("a=%f asin(sin(a))=%f\n", a, asin(sin(a)));
@ -1005,7 +1002,7 @@ void test_fcvt(double a)
    fa = a;
    la = a;
    printf("(float)%f = %f\n", a, fa);
-    //printf("(long double)%f = %Lf\n", a, la); // XXX: currently broken for infinity
+    printf("(long double)%f = %Lf\n", a, la);
    printf("a=" FMT64X "\n", *(uint64_t *)&a);
    printf("la=" FMT64X " %04x\n", *(uint64_t *)&la,
           *(unsigned short *)((char *)(&la) + 8));
@ -1098,10 +1095,11 @@ void test_fenv(void)
    for(i=0;i<8;i++)
        dtab[i] = i + 1;

+    asm volatile ("fninit");
    //TEST_ENV(&float_env16, "data16 fnstenv", "data16 fldenv");
    //TEST_ENV(&float_env16, "data16 fnsave", "data16 frstor");
-    //TEST_ENV(&float_env32, "fnstenv", "fldenv"); // XXX: Temporarily disabled
-    //TEST_ENV(&float_env32, "fnsave", "frstor"); // XXX: Temporarily disabled
+    TEST_ENV(&float_env32, "fnstenv", "fldenv");
+    TEST_ENV(&float_env32, "fnsave", "frstor");

    /* test for ffree */
    for(i=0;i<5;i++)