From f72d0a9ca080494230cc6d2dd87f4a429c764dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Marczewski?= Date: Fri, 14 Jan 2022 21:19:54 +0100 Subject: [PATCH] Add support for PAE (#599) Physical memory is still limited to 32-bit addresses, but systems that enable PAE should work now. --- src/browser/main.js | 2 +- src/const.js | 1 + src/debug.js | 60 ++++++++--- src/rust/cpu/cpu.rs | 122 ++++++++++++++++++++--- src/rust/cpu/instructions_0f.rs | 7 +- src/rust/cpu/memory.rs | 13 +++ tests/kvm-unit-tests/README.md | 1 + tests/kvm-unit-tests/x86/Makefile.common | 1 + tests/kvm-unit-tests/x86/pae.c | 101 +++++++++++++++++++ 9 files changed, 271 insertions(+), 37 deletions(-) create mode 100644 tests/kvm-unit-tests/x86/pae.c diff --git a/src/browser/main.js b/src/browser/main.js index f8985140..f21644ca 100644 --- a/src/browser/main.js +++ b/src/browser/main.js @@ -1762,7 +1762,7 @@ $("dump_gdt").onclick = debug.dump_gdt_ldt.bind(debug); $("dump_idt").onclick = debug.dump_idt.bind(debug); $("dump_regs").onclick = debug.dump_regs.bind(debug); - $("dump_pt").onclick = debug.dump_page_directory.bind(debug); + $("dump_pt").onclick = debug.dump_page_structures.bind(debug); $("dump_log").onclick = function() { diff --git a/src/const.js b/src/const.js index 75728186..11ddc014 100644 --- a/src/const.js +++ b/src/const.js @@ -120,6 +120,7 @@ var /** @const */ var CR0_PG = 1 << 31; +var CR4_PAE = 1 << 5; // https://github.com/qemu/seabios/blob/14221cd86eadba82255fdc55ed174d401c7a0a04/src/fw/paravirt.c#L205-L219 diff --git a/src/debug.js b/src/debug.js index 6cfba5fc..6aec9718 100644 --- a/src/debug.js +++ b/src/debug.js @@ -39,7 +39,7 @@ CPU.prototype.debug_init = function() debug.dump_state = dump_state; debug.dump_stack = dump_stack; - debug.dump_page_directory = dump_page_directory; + debug.dump_page_structures = dump_page_structures; debug.dump_gdt_ldt = dump_gdt_ldt; debug.dump_idt = dump_idt; @@ -115,6 +115,7 @@ CPU.prototype.debug_init = function() } return 
("mode=" + mode + "/" + op_size + " paging=" + (+((cpu.cr[0] & CR0_PG) !== 0)) + + " pae=" + (+((cpu.cr[4] & CR4_PAE) !== 0)) + " iopl=" + iopl + " cpl=" + cpl + " if=" + if_ + " cs:eip=" + cs_eip + " cs_off=" + h(cpu.get_seg_cs() >>> 0, 8) + " flgs=" + h(cpu.get_eflags() >>> 0, 6) + " (" + flag_string + ")" + @@ -297,7 +298,7 @@ CPU.prototype.debug_init = function() } } - function load_page_entry(dword_entry, is_directory) + function load_page_entry(dword_entry, pae, is_directory) { if(!DEBUG) return; @@ -312,7 +313,7 @@ CPU.prototype.debug_init = function() if(size && !is_directory) { - address = dword_entry & 0xFFC00000; + address = dword_entry & (pae ? 0xFFE00000 : 0xFFC00000); } else { @@ -331,19 +332,47 @@ CPU.prototype.debug_init = function() }; } - function dump_page_directory() + var dbg_log = console.log.bind(console); + + function dump_page_structures() { + var pae = !!(cpu.cr[4] & CR4_PAE); + if (pae) + { + dbg_log("PAE enabled"); + + for (var i = 0; i < 4; i++) { + var addr = cpu.cr[3] + 8 * i; + var dword = cpu.read32s(addr); + if (dword & 1) + { + dump_page_directory(dword & 0xFFFFF000, true, i << 30); + } + } + } + else + { + dbg_log("PAE disabled"); + dump_page_directory(cpu.cr[3], false, 0); + } + } + + /* NOTE: PAE entries are 64-bits, we ignore the high half here. */ + function dump_page_directory(pd_addr, pae, start) { if(!DEBUG) return; - for(var i = 0; i < 1024; i++) + var n = pae ? 512 : 1024; + var entry_size = pae ? 8 : 4; + var pd_shift = pae ? 
21 : 22; + + for(var i = 0; i < n; i++) { - var addr = cpu.cr[3] + 4 * i; - var dword = cpu.read32s(addr), - entry = load_page_entry(dword, true); + var addr = pd_addr + i * entry_size, + dword = cpu.read32s(addr), + entry = load_page_entry(dword, pae, true); if(!entry) { - dbg_log("Not present: " + h((i << 22) >>> 0, 8)); continue; } @@ -357,20 +386,21 @@ CPU.prototype.debug_init = function() if(entry.size) { - dbg_log("=== " + h((i << 22) >>> 0, 8) + " -> " + h(entry.address >>> 0, 8) + " | " + flags); + dbg_log("=== " + h(start + (i << pd_shift) >>> 0, 8) + " -> " + + h(entry.address >>> 0, 8) + " | " + flags); continue; } else { - dbg_log("=== " + h((i << 22) >>> 0, 8) + " | " + flags); + dbg_log("=== " + h(start + (i << pd_shift) >>> 0, 8) + " | " + flags); } - for(var j = 0; j < 1024; j++) + for(var j = 0; j < n; j++) { - var sub_addr = entry.address + 4 * j; + var sub_addr = entry.address + j * entry_size; dword = cpu.read32s(sub_addr); - var subentry = load_page_entry(dword, false); + var subentry = load_page_entry(dword, pae, false); if(subentry) { @@ -383,7 +413,7 @@ CPU.prototype.debug_init = function() flags += subentry.accessed ? "A " : " "; flags += subentry.dirty ? 
"Di " : " "; - dbg_log("# " + h((i << 22 | j << 12) >>> 0, 8) + " -> " + + dbg_log("# " + h(start + (i << pd_shift | j << 12) >>> 0, 8) + " -> " + h(subentry.address, 8) + " | " + flags + " (at " + h(sub_addr, 8) + ")"); } } diff --git a/src/rust/cpu/cpu.rs b/src/rust/cpu/cpu.rs index b33b21be..44c44493 100644 --- a/src/rust/cpu/cpu.rs +++ b/src/rust/cpu/cpu.rs @@ -20,8 +20,8 @@ use cpu::global_pointers::*; use cpu::memory; use cpu::memory::mem8; use cpu::memory::{ - in_mapped_range, read8, read16, read32s, read64s, read128, read_aligned32, write8, - write_aligned32, + in_mapped_range, read8, read16, read32s, read64s, read128, read_aligned32, + read_aligned64, write8, write_aligned32, }; use cpu::misc_instr::{ adjust_stack_reg, get_stack_pointer, getaf, getcf, getof, getpf, getsf, getzf, pop16, pop32s, @@ -1797,6 +1797,20 @@ pub unsafe fn do_page_translation(addr: i32, for_writing: bool, user: bool) -> O } } +/* + * 32-bit paging: + * - 10 bits PD | 10 bits PT | 12 bits offset + * - 10 bits PD | 22 bits offset (4MB huge page) + * + * PAE paging: + * - 2 bits PDPT | 9 bits PD | 9 bits PT | 12 bits offset + * - 2 bits PDPT | 9 bits PD | 21 bits offset (2MB huge page) + * + * Note that PAE entries are 64-bit, and can describe physical addresses over 32 + * bits. However, since we support only 32-bit physical addresses, we require + * the high half of the entry to be 0 (except for the execute-disable bit in + * PDE and PTE). 
+ */ pub unsafe fn do_page_walk( addr: i32, for_writing: bool, @@ -1816,16 +1830,25 @@ pub unsafe fn do_page_walk( else { profiler::stat_increment(TLB_MISS); - let page_dir_addr = (*cr.offset(3) as u32 >> 2).wrapping_add((page >> 10) as u32) as i32; - let page_dir_entry = read_aligned32(page_dir_addr as u32); - // XXX - let kernel_write_override = !user && 0 == *cr & CR0_WP; - if 0 == page_dir_entry & PAGE_TABLE_PRESENT_MASK { - // to do at this place: - // - // - set cr2 = addr (which caused the page fault) - // - call_interrupt_vector with id 14, error code 0-7 (requires information if read or write) - // - prevent execution of the function that triggered this call + let pae = *cr.offset(4) & CR4_PAE != 0; + + let (page_dir_addr, page_dir_entry) = + match walk_page_directory(pae, addr) { + Some((a, e)) => (a, e), + // to do at this place: + // + // - set cr2 = addr (which caused the page fault) + // - call_interrupt_vector with id 14, error code 0-7 (requires information if read or write) + // - prevent execution of the function that triggered this call + None => return Err(PageFault { + addr, + for_writing, + user, + present: false, + }), + }; + + if page_dir_entry & PAGE_TABLE_PRESENT_MASK == 0 { return Err(PageFault { addr, for_writing, @@ -1833,6 +1856,9 @@ pub unsafe fn do_page_walk( present: false, }); } + + // XXX + let kernel_write_override = !user && 0 == *cr & CR0_WP; if page_dir_entry & PAGE_TABLE_RW_MASK == 0 && !kernel_write_override { can_write = false; if for_writing { @@ -1868,13 +1894,17 @@ pub unsafe fn do_page_walk( write_aligned32(page_dir_addr as u32, new_page_dir_entry); } - high = (page_dir_entry as u32 & 0xFFC00000 | (addr & 0x3FF000) as u32) as i32; + high = if pae { + (page_dir_entry as u32 & 0xFFE00000 | (addr & 0x1FF000) as u32) as i32 + } else { + (page_dir_entry as u32 & 0xFFC00000 | (addr & 0x3FF000) as u32) as i32 + }; global = page_dir_entry & PAGE_TABLE_GLOBAL_MASK == PAGE_TABLE_GLOBAL_MASK } else { - let page_table_addr = 
((page_dir_entry as u32 & 0xFFFFF000) >> 2) - .wrapping_add((page & 1023) as u32) as i32; - let page_table_entry = read_aligned32(page_table_addr as u32); + let (page_table_addr, page_table_entry) = + walk_page_table(pae, addr, page_dir_entry); + if page_table_entry & PAGE_TABLE_PRESENT_MASK == 0 { return Err(PageFault { addr, @@ -1883,6 +1913,7 @@ pub unsafe fn do_page_walk( present: false, }); } + if page_table_entry & PAGE_TABLE_RW_MASK == 0 && !kernel_write_override { can_write = false; if for_writing { @@ -1967,6 +1998,65 @@ pub unsafe fn do_page_walk( return Ok(high); } +unsafe fn walk_page_directory(pae: bool, addr: i32) -> Option<(i32, i32)> { + if pae { + let pdpt_idx = (addr as u32) >> 30; + let page_dir_idx = ((addr as u32) >> 21) & 0x1FF; + + let pdpt_addr = (*cr.offset(3) as u32 >> 2).wrapping_add(pdpt_idx << 1); + let pdpt_entry = read_aligned64(pdpt_addr); + if pdpt_entry as i32 & PAGE_TABLE_PRESENT_MASK == 0 { + return None; + } + dbg_assert!( + pdpt_entry as u64 & 0xFFFF_FFFF_0000_0000 == 0, + "Unsupported: PDPT entry larger than 32 bits" + ); + + let page_dir_addr = ((pdpt_entry as u32 & 0xFFFFF000)>> 2).wrapping_add(page_dir_idx << 1); + let page_dir_entry = read_aligned64(page_dir_addr); + // Note that the highest bit of PDE specifies execute-disable, and can + // be set (we'll ignore it anyway). 
+ dbg_assert!( + page_dir_entry as u64 & 0x7FFF_FFFF_0000_0000 == 0, + "Unsupported: Page directory entry larger than 32 bits" + ); + + return Some((page_dir_addr as i32, page_dir_entry as i32)); + } + + let page_dir_idx = (addr as u32) >> 22; + let page_dir_addr = (*cr.offset(3) as u32 >> 2).wrapping_add(page_dir_idx); + let page_dir_entry = read_aligned32(page_dir_addr); + return Some((page_dir_addr as i32, page_dir_entry)); +} + +unsafe fn walk_page_table( + pae: bool, + addr: i32, + page_dir_entry: i32 +) -> (i32, i32) { + let page_table = (page_dir_entry as u32 & 0xFFFFF000) >> 2; + if pae { + let page_table_idx = (addr as u32 >> 12) & 0x1FF; + let page_table_addr = page_table.wrapping_add(page_table_idx << 1); + let page_table_entry = read_aligned64(page_table_addr); + // Note that the highest bit of PTE specifies execute-disable, and can + // be set (we'll ignore it anyway). + dbg_assert!( + page_table_entry as u64 & 0x7FFF_FFFF_0000_0000 == 0, + "Unsupported: Page table entry larger than 32 bits" + ); + + return (page_table_addr as i32, page_table_entry as i32); + } + + let page_table_idx = (addr as u32 >> 12) & 0x3FF; + let page_table_addr = page_table.wrapping_add(page_table_idx); + let page_table_entry = read_aligned32(page_table_addr); + return (page_table_addr as i32, page_table_entry); +} + #[no_mangle] pub unsafe fn full_clear_tlb() { profiler::stat_increment(FULL_CLEAR_TLB); diff --git a/src/rust/cpu/instructions_0f.rs b/src/rust/cpu/instructions_0f.rs index eb202acc..7d7a867c 100644 --- a/src/rust/cpu/instructions_0f.rs +++ b/src/rust/cpu/instructions_0f.rs @@ -798,13 +798,10 @@ pub unsafe fn instr_0F22(r: i32, creg: i32) { return; } else { - if 0 != (*cr.offset(4) ^ data) & (CR4_PGE | CR4_PSE) { + if 0 != (*cr.offset(4) ^ data) & (CR4_PGE | CR4_PSE | CR4_PAE) { full_clear_tlb(); } *cr.offset(4) = data; - if 0 != *cr.offset(4) & CR4_PAE { - dbg_assert!(false, "PAE is not supported"); - } } }, _ => { @@ -3177,7 +3174,7 @@ pub unsafe fn instr_0FA2() 
{ ecx |= 1 << 31 }; // hypervisor edx = (if true /* have fpu */ { 1 } else { 0 }) | // fpu - vme | 1 << 3 | 1 << 4 | 1 << 5 | // vme, pse, tsc, msr + vme | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | // vme, pse, tsc, msr, pae 1 << 8 | 1 << 11 | 1 << 13 | 1 << 15 | // cx8, sep, pge, cmov 1 << 23 | 1 << 24 | 1 << 25 | 1 << 26; // mmx, fxsr, sse1, sse2 diff --git a/src/rust/cpu/memory.rs b/src/rust/cpu/memory.rs index 611f1565..867476ab 100644 --- a/src/rust/cpu/memory.rs +++ b/src/rust/cpu/memory.rs @@ -97,6 +97,19 @@ pub unsafe fn read_aligned32(addr: u32) -> i32 { }; } +pub unsafe fn read_aligned64(addr: u32) -> i64 { + dbg_assert!(addr < 0x40000000 as u32); + dbg_assert!(addr & 1 == 0); + if in_mapped_range(addr << 2) { + let lo = mmap_read32(addr << 2); + let hi = mmap_read32(addr + 1 << 2); + return lo as u32 as i64 | (hi as i64) << 32; /* zero-extend lo so its bit 31 cannot smear into the high dword */ + } + else { + return *(mem8 as *mut i64).offset((addr >> 1) as isize); + } +} + pub unsafe fn read128(addr: u32) -> reg128 { let mut value: reg128 = reg128 { i8_0: [0 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], diff --git a/tests/kvm-unit-tests/README.md b/tests/kvm-unit-tests/README.md index 1f4fe7b2..7bb7b074 100644 --- a/tests/kvm-unit-tests/README.md +++ b/tests/kvm-unit-tests/README.md @@ -12,6 +12,7 @@ make -C ../../build/libv86.js ./run.js x86/sieve.flat ./run.js x86/ioapic.flat ./run.js x86/apic.flat +./run.js x86/pae.flat ``` Tests can also be run in browser by going to `?profile=test-$name` (for diff --git a/tests/kvm-unit-tests/x86/Makefile.common b/tests/kvm-unit-tests/x86/Makefile.common index 73069d86..85bee533 100644 --- a/tests/kvm-unit-tests/x86/Makefile.common +++ b/tests/kvm-unit-tests/x86/Makefile.common @@ -51,6 +51,7 @@ tests-common = $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \ $(TEST_DIR)/init.flat $(TEST_DIR)/smap.flat \ $(TEST_DIR)/hyperv_synic.flat $(TEST_DIR)/hyperv_stimer.flat \ $(TEST_DIR)/hyperv_connections.flat \ + $(TEST_DIR)/pae.flat \ ifdef API tests-api = api/api-sample api/dirty-log
api/dirty-log-perf diff --git a/tests/kvm-unit-tests/x86/pae.c b/tests/kvm-unit-tests/x86/pae.c new file mode 100644 index 00000000..b07219bf --- /dev/null +++ b/tests/kvm-unit-tests/x86/pae.c @@ -0,0 +1,101 @@ +/* Simple PAE paging test. See lib/x86/vm.c for similar code which sets up + * non-PAE paging. */ + +#include "fwcfg.h" +#include "asm/page.h" +#include "processor.h" + +#ifdef __x86_64__ +#error This test is 32-bit only. +#endif + +#define HUGE_PAGE_SIZE (1UL << 21) + +uint64_t pdpt[4] __attribute__((aligned(0x20))); +uint64_t page_dirs[4 * 512] __attribute__((aligned(0x1000))); +uint64_t page_tables[512 * 512] __attribute__((aligned(0x1000))); + +static bool is_pae_supported(void) { + struct cpuid c = cpuid(1); + return c.d & (1 << 6); +} + +/* Fill page directory at `pd` with huge page entries. */ +static void setup_pd_huge_pages(uint64_t *pd, uint64_t start, uint64_t end) { + uint64_t phys = start; + for (unsigned int i = 0; i < 512; i++) { + *pd++ = phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK | + PT_PAGE_SIZE_MASK; + + phys += HUGE_PAGE_SIZE; + if (phys >= end) + return; + } +} + +/* Fill page directory at `pd` with page table entries, and use memory at `pt` + * to create page tables. 
*/ +static void setup_pd(uint64_t *pd, uint64_t *pt, uint64_t start, uint64_t end) { + uint64_t phys = start; + for (unsigned int i = 0; i < 512; i++) { + *pd++ = (uint32_t)pt | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + for (unsigned int j = 0; j < 512; j++) { + *pt++ = phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + phys += PAGE_SIZE; + if (phys >= end) + return; + } + } +} + +static void setup_mmu(void) { + uint64_t mem_size = fwcfg_get_u64(FW_CFG_RAM_SIZE); + if (mem_size > (1ULL << 32)) + mem_size = 1ULL << 32; + + /* Map physical memory at 0000_0000 using huge pages */ + pdpt[0] = (uint32_t)&page_dirs[0 * 512] | PT_PRESENT_MASK; + setup_pd_huge_pages(&page_dirs[0 * 512], 0, mem_size); + + /* Map physical memory at 4000_0000 using huge pages */ + pdpt[1] = (uint32_t)&page_dirs[1 * 512] | PT_PRESENT_MASK; + setup_pd_huge_pages(&page_dirs[1 * 512], 0, mem_size); + + /* Map physical memory at 8000_0000 using huge pages */ + pdpt[2] = (uint32_t)&page_dirs[2 * 512] | PT_PRESENT_MASK; + setup_pd_huge_pages(&page_dirs[2 * 512], 0, mem_size); + + /* Map physical memory at C000_0000 using normal tables */ + pdpt[3] = (uint32_t)&page_dirs[3 * 512] | PT_PRESENT_MASK; + setup_pd(&page_dirs[3 * 512], &page_tables[0], 0, mem_size); + + write_cr0(0); + write_cr4(read_cr4() | X86_CR4_PAE); + write_cr3((uint32_t)pdpt); + write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP); + + printf("paging enabled\n"); +} + +int main(void) +{ + if (!is_pae_supported()) { + printf("PAE not supported\n"); + return 1; + } + printf("PAE supported\n"); + setup_mmu(); + + volatile unsigned int test; + for (unsigned int i = 1; i < 4; i++) { /* unsigned so that i << 30 cannot overflow a signed int */ + volatile unsigned int *ptr = (unsigned int*)((uint32_t)&test + (i << 30)); + printf("writing %u to %p, and reading from %p\n", i, ptr, &test); + *ptr = i; + if (test != i) { + printf("error, got %u\n", test); /* report the value actually read back */ + return 1; + } + } + printf("everything OK\n"); + return 0; +}