v86/gen/generate_analyzer.js

#!/usr/bin/env node
"use strict";

const fs = require("fs");
const path = require("path");
const encodings = require("./x86_table");
const c_ast = require("./c_ast");
const { hex, mkdirpSync, get_switch_value, get_switch_exist, finalize_table } = require("./util");

// Statement added to an instruction's generated body when its encoding is
// marked as nonfaulting.
const APPEND_NONFAULTING_FLAG = "analysis.flags |= JIT_INSTR_NONFAULTING_FLAG;";

// Output directory: --output-dir if given, otherwise ../build next to this script.
const OUT_DIR = get_switch_value("--output-dir") || path.join(__dirname, "..", "build");

mkdirpSync(OUT_DIR);

const table_arg = get_switch_value("--table");
const gen_all = get_switch_exist("--all");

// A table is generated when --all is passed or when it is named by --table.
const is_requested = table_name => gen_all || table_arg === table_name;

const to_generate = {
    analyzer: is_requested("analyzer"),
    analyzer0f_16: is_requested("analyzer0f_16"),
    analyzer0f_32: is_requested("analyzer0f_32"),
};

console.assert(
    Object.values(to_generate).some(Boolean),
    "Pass --table [analyzer|analyzer0f_16|analyzer0f_32] or --all to pick which tables to generate"
);

// gen_table is a hoisted function declaration; it must run after the
// constants above have been initialised.
gen_table();

/**
 * Pick the C expression that reads the immediate operand of an encoding.
 *
 * @param {object} op  encoding entry; the imm8, imm8s, immaddr, imm16,
 *     imm1632, imm32, os and opcode properties are consulted
 * @param {number|undefined} size_variant  operand size used when the encoding
 *     is size-dependent (os set or odd opcode); otherwise 8 is assumed
 * @returns {string|undefined} the read call without trailing semicolon, or
 *     undefined when the encoding declares no immediate
 */
function gen_read_imm_call(op, size_variant)
{
    if(op.imm8)
    {
        return "read_imm8()";
    }

    if(op.imm8s)
    {
        return "read_imm8s()";
    }

    if(op.immaddr)
    {
        // immaddr: depends on address size
        return "read_moffs()";
    }

    if(!(op.imm16 || op.imm1632 || op.imm32))
    {
        return undefined;
    }

    const size = (op.os || op.opcode % 2 === 1) ? size_variant : 8;
    const reads_16_bits = op.imm16 || (op.imm1632 && size === 16);

    if(reads_16_bits)
    {
        return "read_imm16()";
    }

    console.assert(op.imm1632 && size === 32 || op.imm32);
    return "read_imm32s()";
}

/**
 * Format a C call statement: `name(arg1, arg2);`.
 *
 * @param {string} name  function name
 * @param {string[]} [args]  argument expressions; any falsy value means none
 * @returns {string} the call, terminated by a semicolon
 */
function gen_call(name, args)
{
    const joined_args = (args || []).join(", ");
    return `${name}(${joined_args});`;
}

/**
 * Turn a list of C expressions into statements by appending `;` to each.
 *
 * @param {string[]} args  expressions, left unmodified
 * @returns {string[]} a new array of statements in the same order
 */
function gen_codegen_call(args)
{
    const statements = [];

    for(const expression of args)
    {
        statements.push(expression + ";");
    }

    return statements;
}

/**
 * Like gen_codegen_call, but the returned statements start with a
 * `modrm_skip(modrm_byte);` call.
 *
 * @param {string[]} args  expressions to emit after the modrm skip
 * @returns {string[]} a new array: the skip call followed by one statement
 *     per expression
 */
function gen_codegen_call_modrm(args)
{
    const statements = args.map(expression => expression + ";");
    const skip_modrm = gen_call("modrm_skip", ["modrm_byte"]);

    return [skip_modrm, ...statements];
}

/**
 * Build an "if-else" node that branches on `modrm_byte < 0xC0`.
 *
 * The `if` branch holds the modrm skip plus `mem_args`, the `else` branch
 * holds `reg_args`; each branch is followed by its optional postfix lines.
 *
 * @param {string[]} mem_args  expressions for the `modrm_byte < 0xC0` branch
 * @param {string[]} reg_args  expressions for the else branch
 * @param {{mem_postfix?: Array, reg_postfix?: Array}} [postfixes]  extra
 *     body entries appended to the respective branch
 * @returns {object} the "if-else" syntax tree node
 */
function gen_modrm_mem_reg_split(mem_args, reg_args, postfixes={})
{
    const mem_postfix = postfixes.mem_postfix !== undefined ? postfixes.mem_postfix : [];
    const reg_postfix = postfixes.reg_postfix !== undefined ? postfixes.reg_postfix : [];

    const mem_branch = {
        condition: "modrm_byte < 0xC0",
        body: gen_codegen_call_modrm(mem_args).concat(mem_postfix),
    };

    const reg_branch = {
        body: gen_codegen_call(reg_args).concat(reg_postfix),
    };

    return {
        type: "if-else",
        if_blocks: [mem_branch],
        else_block: reg_branch,
    };
}

/*
 * Current naming scheme:
 * instr(16|32|)_((66|F2|F3)?0F)?[0-9a-f]{2}(_[0-7])?(_mem|_reg|)
 */

/**
 * Assemble the handler name of an encoding piece by piece:
 * `instr` + size (only when encoding.os) + `_` + prefix byte (when given)
 * + `0F` (for 0x0Fxx opcodes) + low opcode byte + `_<fixed_g>` (when set).
 *
 * @param {object} encoding  encoding entry; os, opcode and fixed_g are read
 * @param {number|undefined} size  operand size placed after `instr`
 * @param {number} [prefix_variant]  prefix byte to embed, e.g. 0x66
 * @returns {string} the instruction name
 */
function make_instruction_name(encoding, size, prefix_variant)
{
    const { opcode, fixed_g } = encoding;

    let name = "instr";

    if(encoding.os)
    {
        name += String(size);
    }

    name += "_";

    if(prefix_variant !== undefined)
    {
        name += hex(prefix_variant, 2);
    }

    if((opcode & 0xFF00) === 0x0F00)
    {
        name += "0F";
    }

    name += hex(opcode & 0xFF, 2);

    if(fixed_g !== undefined)
    {
        name += `_${fixed_g}`;
    }

    return name;
}

/**
 * Decide which modrm branch receives the nonfaulting flag statement.
 *
 * For a nonfaulting encoding the flag goes on the register branch, except
 * for opcode 0x8D where it goes on the memory branch instead. Encodings that
 * are not nonfaulting get two empty postfix lists.
 *
 * @param {object} encoding  encoding entry; nonfaulting and opcode are read
 * @returns {{mem_postfix: string[], reg_postfix: string[]}}
 */
function get_nonfaulting_mem_reg_postfix(encoding)
{
    const flag_lines = encoding.nonfaulting ? [APPEND_NONFAULTING_FLAG] : [];
    const is_lea = encoding.opcode === 0x8D;

    return {
        mem_postfix: is_lea ? flag_lines : [],
        reg_postfix: is_lea ? [] : flag_lines,
    };
}

/**
 * Build the if/else-if chain that selects a modrm mem/reg split per prefix byte.
 *
 * One `if` block is emitted for each prefix flagged in `present_prefixes`, in
 * the fixed order 66, F2, F3, followed by an `else` block for the remaining
 * case. Every block gets its own node from gen_modrm_mem_reg_split() with no
 * postfixes.
 *
 * @param {{"66": boolean, "F2": boolean, "F3": boolean}} present_prefixes
 * @param {string[]} mem_args  expressions emitted on the `modrm_byte < 0xC0` path
 * @param {string[]} reg_args  expressions emitted on the other path
 * @returns {object} an "if-else" syntax tree node
 */
function gen_prefix_dispatch(present_prefixes, mem_args, reg_args)
{
    const split = () => gen_modrm_mem_reg_split(mem_args, reg_args, {});

    const if_blocks = ["66", "F2", "F3"]
        .filter(prefix => present_prefixes[prefix])
        .map(prefix => ({
            condition: "prefixes_ & PREFIX_" + prefix,
            body: [split()],
        }));

    return {
        type: "if-else",
        if_blocks,
        else_block: {
            body: [split()],
        },
    };
}

/**
 * Generate the analyzer body (a list of C statements and syntax tree nodes)
 * for one opcode.
 *
 * @param {object[]} encodings  all encoding entries grouped under this opcode;
 *     the first entry decides which shape of body is generated
 * @param {number|undefined} size  operand size variant forwarded to the
 *     immediate reader and name builder (callers in this file pass 16, 32 or
 *     undefined)
 * @returns {Array<string|object>} body entries suitable for a switch case
 */
function gen_instruction_body(encodings, size)
{
    const encoding = encodings[0];

    // The prefix byte of a prefixed encoding is stored above bit 15 of the opcode
    const has_prefix = prefix => encodings.some(e => (e.opcode >>> 16) === prefix);
    const present_prefixes = {
        "66": has_prefix(0x66),
        "F2": has_prefix(0xF2),
        "F3": has_prefix(0xF3),
    };
    const has_any_prefix = present_prefixes["66"] || present_prefixes["F2"] || present_prefixes["F3"];

    console.assert(
        !encodings.some(e => e.nonfaulting && e.block_boundary),
        "Unsupported: instruction cannot be both a jump and nonfaulting. Opcode: 0x" + hex(encoding.opcode)
    );

    if(has_any_prefix)
    {
        console.assert((encoding.opcode & 0xFF00) === 0x0F00);
    }

    const instruction_postfix = encoding.block_boundary ? ["analysis.flags |= JIT_INSTR_BLOCK_BOUNDARY_FLAG;"] : [];

    if(encoding.fixed_g !== undefined)
    {
        // instruction with modrm byte where the middle 3 bits encode the instruction

        // group by opcode without prefix plus middle bits of modrm byte
        let cases = encodings.reduce((cases_by_opcode, case_) => {
            console.assert(typeof case_.fixed_g === "number");
            cases_by_opcode[case_.opcode & 0xFFFF | case_.fixed_g << 16] = case_;
            return cases_by_opcode;
        }, Object.create(null));
        cases = Object.values(cases).sort((e1, e2) => e1.fixed_g - e2.fixed_g);

        return [
            "int32_t modrm_byte = read_imm8();",
            {
                type: "switch",
                condition: "modrm_byte >> 3 & 7",
                cases: cases.map(case_ => {
                    const fixed_g = case_.fixed_g;
                    // The double space is kept so the generated text stays byte-identical
                    const case_postfix = case_.block_boundary ? ["analysis.flags |=  JIT_INSTR_BLOCK_BOUNDARY_FLAG;"] : [];

                    // The same immediate read is emitted on the mem and the reg path
                    const imm_read = gen_read_imm_call(case_, size);
                    const imm_args = imm_read ? [imm_read] : [];

                    if(has_any_prefix)
                    {
                        return {
                            conditions: [fixed_g],
                            body: [
                                "int32_t prefixes_ = *prefixes;",
                                gen_prefix_dispatch(present_prefixes, imm_args, imm_args),
                            ].concat(case_postfix),
                        };
                    }
                    else
                    {
                        const body = [
                            gen_modrm_mem_reg_split(
                                imm_args,
                                imm_args,
                                get_nonfaulting_mem_reg_postfix(case_)
                            )
                        ].concat(case_postfix);

                        return {
                            conditions: [fixed_g],
                            body,
                        };
                    }
                }),

                default_case: {
                    body: [
                        "assert(false);",
                        "analysis.flags |= JIT_INSTR_BLOCK_BOUNDARY_FLAG;",
                    ],
                }
            },
        ].concat(instruction_postfix);
    }
    else if(has_any_prefix)
    {
        // instruction with modrm byte (no fixed_g) that has prefixed variants

        console.assert(encoding.e);
        console.assert(!encoding.ignore_mod);

        const imm_read = gen_read_imm_call(encoding, size);
        const imm_args = imm_read ? [imm_read] : [];

        return [
            "int32_t modrm_byte = read_imm8();",
            "int32_t prefixes_ = *prefixes;",
            gen_prefix_dispatch(present_prefixes, imm_args, imm_args),
        ].concat(instruction_postfix);
    }
    else if(encoding.e)
    {
        // instruction with modrm byte where the middle 3 bits encode a register

        console.assert(encodings.length === 1);

        const imm_read = gen_read_imm_call(encoding, size);

        if(encoding.ignore_mod)
        {
            console.assert(!imm_read, "Unexpected instruction (ignore mod with immediate value)");

            // Has modrm byte, but the 2 mod bits are ignored and both
            // operands are always registers (0f20-0f24)

            if(encoding.nonfaulting)
            {
                instruction_postfix.push(APPEND_NONFAULTING_FLAG);
            }

            return ["int32_t modrm_byte = read_imm8();"]
                .concat(instruction_postfix);
        }
        else
        {
            const imm_args = imm_read ? [imm_read] : [];

            return [
                "int32_t modrm_byte = read_imm8();",
                gen_modrm_mem_reg_split(
                    imm_args,
                    imm_args,
                    get_nonfaulting_mem_reg_postfix(encoding)
                ),
            ].concat(instruction_postfix);
        }
    }
    else if(encoding.prefix)
    {
        console.assert(!encoding.nonfaulting, "Prefix/custom instructions cannot be marked as nonfaulting.");

        const instruction_name = make_instruction_name(encoding, size) + "_analyze";
        const imm_read = gen_read_imm_call(encoding, size);
        const args = imm_read ? [imm_read] : [];

        // Prefix calls can add to the return flags, so their result is returned directly
        return ["return " + gen_call(instruction_name, args)].concat(instruction_postfix);
    }
    else
    {
        // instruction without modrm byte or prefix

        const imm_read = gen_read_imm_call(encoding, size);

        const args = [];

        if(imm_read)
        {
            if(encoding.jump_offset_imm)
            {
                args.push("int32_t jump_offset = " + imm_read + ";");
                args.push(`
                    analysis.jump_target = is_osize_32() ?
                        *instruction_pointer + jump_offset :
                        get_seg_cs() + ((*instruction_pointer - get_seg_cs() + jump_offset) & 0xFFFF);`);
            }

            else
            {
                args.push(imm_read + ";");
            }
        }

        if(encoding.extra_imm16)
        {
            console.assert(imm_read);
            args.push(gen_call("read_imm16"));
        }
        else if(encoding.extra_imm8)
        {
            console.assert(imm_read);
            args.push(gen_call("read_imm8"));
        }

        if(encoding.nonfaulting)
        {
            instruction_postfix.push(APPEND_NONFAULTING_FLAG);
        }

        if(encoding.conditional_jump)
        {
            // The low nibble of the opcode is recorded as the condition index
            instruction_postfix.push("analysis.condition_index = " + (encoding.opcode & 0xF) + ";");
        }

        return args.concat(instruction_postfix);
    }
}

/**
 * Build the opcode switch tables and hand the requested ones to
 * finalize_table().
 *
 * Encodings with an opcode below 0x100 feed the "analyzer" table; encodings
 * whose opcode has 0x0F in bits 8-15 feed the "analyzer0f_16" and
 * "analyzer0f_32" tables, keyed by their low byte. Other encodings are
 * ignored. Which tables are actually written is controlled by `to_generate`.
 */
function gen_table()
{
    // Append `entry` to the list stored under `key`, creating the list on demand
    const append_to = (table, key, entry) => {
        table[key] = table[key] || [];
        table[key].push(entry);
    };

    // Wrap a list of cases in a `switch(opcode)` node with an asserting default
    const make_opcode_switch = switch_cases => ({
        type: "switch",
        condition: "opcode",
        cases: switch_cases,
        default_case: {
            body: ["assert(false);"]
        },
    });

    // Print one table and pass the text, newline-terminated, to finalize_table
    const write_table = (name, table) => {
        finalize_table(
            OUT_DIR,
            name,
            c_ast.print_syntax_tree([table]).join("\n") + "\n"
        );
    };

    const one_byte = Object.create(null);
    const two_byte = Object.create(null);

    for(const entry of encodings)
    {
        const full_opcode = entry.opcode;

        if(full_opcode >= 0x100)
        {
            if((full_opcode & 0xFF00) === 0x0F00)
            {
                append_to(two_byte, full_opcode & 0xFF, entry);
            }
            continue;
        }

        append_to(one_byte, full_opcode, entry);
    }

    const cases = [];
    for(let opcode = 0; opcode < 0x100; opcode++)
    {
        const group = one_byte[opcode];
        console.assert(group && group.length);

        const label = "0x" + hex(opcode, 2);

        if(group[0].os)
        {
            // Size-dependent: a 16-bit case, and a 32-bit case tagged with 0x100
            cases.push(
                { conditions: [label], body: gen_instruction_body(group, 16), },
                { conditions: [label + "|0x100"], body: gen_instruction_body(group, 32), }
            );
        }
        else
        {
            cases.push({
                conditions: [label, label + "|0x100"],
                body: gen_instruction_body(group, undefined),
            });
        }
    }

    if(to_generate.analyzer)
    {
        write_table("analyzer", make_opcode_switch(cases));
    }

    const cases0f_16 = [];
    const cases0f_32 = [];
    for(let opcode = 0; opcode < 0x100; opcode++)
    {
        // Opcodes without any encoding get a bare placeholder entry
        const group = two_byte[opcode] || [
            {
                opcode: 0x0F00 | opcode,
            },
        ];

        console.assert(group && group.length);

        const label = "0x" + hex(opcode, 2);

        if(group[0].os)
        {
            cases0f_16.push({
                conditions: [label],
                body: gen_instruction_body(group, 16),
            });
            cases0f_32.push({
                conditions: [label],
                body: gen_instruction_body(group, 32),
            });
        }
        else
        {
            // Size-independent: the same case object is shared by both tables
            const shared_case = {
                conditions: [label],
                body: gen_instruction_body(group, undefined),
            };
            cases0f_16.push(shared_case);
            cases0f_32.push(shared_case);
        }
    }

    if(to_generate.analyzer0f_16)
    {
        write_table("analyzer0f_16", make_opcode_switch(cases0f_16));
    }

    if(to_generate.analyzer0f_32)
    {
        write_table("analyzer0f_32", make_opcode_switch(cases0f_32));
    }
}