From a12bdafde42f426252349a8f8677d410fc29b510 Mon Sep 17 00:00:00 2001 From: Sami Samhuri Date: Thu, 18 Jun 2026 06:42:12 -0700 Subject: [PATCH] WIP: re-organize into lib/ dir --- bin/compile | 7 + lib/compiler.rb | 116 +++ lib/compiler/asm/arch.rb | 67 ++ lib/compiler/asm/assembler.rb | 82 ++ lib/compiler/asm/binary_assembler.rb | 322 ++++++++ lib/compiler/asm/constant_proxy.rb | 13 + lib/compiler/asm/cstruct.rb | 342 ++++++++ lib/compiler/asm/elf.rb | 10 + lib/compiler/asm/elf/elfsymtab.rb | 7 + lib/compiler/asm/elf/elfwriter.rb | 9 + lib/compiler/asm/macho.rb | 10 + lib/compiler/asm/macho/load_commands.rb | 61 ++ lib/compiler/asm/macho/mach_header.rb | 46 ++ lib/compiler/asm/macho/nlist.rb | 50 ++ lib/compiler/asm/macho/object_file.rb | 373 +++++++++ lib/compiler/asm/macho/relocation_info.rb | 35 + lib/compiler/asm/macho/section.rb | 34 + lib/compiler/asm/macho/structs.rb | 53 ++ lib/compiler/asm/macho/symbol.rb | 31 + lib/compiler/asm/macho/symbol_table.rb | 88 ++ lib/compiler/asm/object_file.rb | 28 + lib/compiler/asm/register_proxy.rb | 70 ++ lib/compiler/asm/symbol_table.rb | 99 +++ lib/compiler/asm/text_assembler.rb | 73 ++ lib/compiler/asm/variable_proxy.rb | 43 + lib/compiler/asm/x86/arch.rb | 42 + lib/compiler/asm/x86/binary_assembler.rb | 866 +++++++++++++++++++ lib/compiler/asm/x86/registers.rb | 32 + lib/compiler/asm/x86/template.darwin.asm | 11 + lib/compiler/asm/x86/template.linux.asm | 13 + lib/compiler/asm/x86/text_assembler.rb | 159 ++++ lib/compiler/build.rb | 108 +++ lib/compiler/parse_error.rb | 14 + lib/compiler/parser.rb | 966 ++++++++++++++++++++++ 34 files changed, 4280 insertions(+) create mode 100755 bin/compile create mode 100644 lib/compiler.rb create mode 100644 lib/compiler/asm/arch.rb create mode 100644 lib/compiler/asm/assembler.rb create mode 100644 lib/compiler/asm/binary_assembler.rb create mode 100644 lib/compiler/asm/constant_proxy.rb create mode 100644 lib/compiler/asm/cstruct.rb create mode 100644 lib/compiler/asm/elf.rb 
create mode 100644 lib/compiler/asm/elf/elfsymtab.rb create mode 100644 lib/compiler/asm/elf/elfwriter.rb create mode 100644 lib/compiler/asm/macho.rb create mode 100644 lib/compiler/asm/macho/load_commands.rb create mode 100644 lib/compiler/asm/macho/mach_header.rb create mode 100644 lib/compiler/asm/macho/nlist.rb create mode 100644 lib/compiler/asm/macho/object_file.rb create mode 100644 lib/compiler/asm/macho/relocation_info.rb create mode 100644 lib/compiler/asm/macho/section.rb create mode 100644 lib/compiler/asm/macho/structs.rb create mode 100644 lib/compiler/asm/macho/symbol.rb create mode 100644 lib/compiler/asm/macho/symbol_table.rb create mode 100644 lib/compiler/asm/object_file.rb create mode 100644 lib/compiler/asm/register_proxy.rb create mode 100644 lib/compiler/asm/symbol_table.rb create mode 100644 lib/compiler/asm/text_assembler.rb create mode 100644 lib/compiler/asm/variable_proxy.rb create mode 100644 lib/compiler/asm/x86/arch.rb create mode 100644 lib/compiler/asm/x86/binary_assembler.rb create mode 100644 lib/compiler/asm/x86/registers.rb create mode 100644 lib/compiler/asm/x86/template.darwin.asm create mode 100644 lib/compiler/asm/x86/template.linux.asm create mode 100644 lib/compiler/asm/x86/text_assembler.rb create mode 100755 lib/compiler/build.rb create mode 100644 lib/compiler/parse_error.rb create mode 100644 lib/compiler/parser.rb diff --git a/bin/compile b/bin/compile new file mode 100755 index 0000000..aecb64e --- /dev/null +++ b/bin/compile @@ -0,0 +1,7 @@ +#!/usr/bin/env ruby + +$LOAD_PATH.unshift('../lib') + +require 'compiler' + +??? 
diff --git a/lib/compiler.rb b/lib/compiler.rb new file mode 100644 index 0000000..c4d7042 --- /dev/null +++ b/lib/compiler.rb @@ -0,0 +1,116 @@ +this_dir = File.dirname(__FILE__) +Dir.chdir(File.expand_path('..', this_dir)) +$LOAD_PATH.unshift(this_dir) unless $LOAD_PATH.include?(this_dir) + +require 'compiler/parser' + +class Compiler + + attr_reader :platform, :arch_name, :format, :binformat + + attr_reader :arch, :asm, :symbol_table_factory, :object_file_factor + + # platform [String] "linux" or "darwin" + # arch_name [String] "x86" or "arm" + # format [String] "text" or "bin" + # binform [String, nil] "elf" or "macho", only used when format is "bin" + def initialize(platform, arch_name, format, binformat = nil) + @platform = platform + @arch_name = arch_name + @format = format + @binformat = binformat + wire + end + + def compile(input) + parser = Parser.new(input, asm) + parser.parse + parser.compile + end + + + ####### + private + ####### + + def wire + if format == 'bin' + case binformat + + when 'elf' + wire_elf + + when 'macho' + wire_macho + + else + raise "unsupported binary format: #{binformat}" + end + end + + case @arch_name + + when 'x86' + wire_x86 + + when 'arm' + wire_arm + + else + raise "unsupported arch: #{arch}" + end + end + + def wire_elf + require 'compiler/asm/elf/object_file' + require 'compiler/asm/elf/symbol_table' + + @symbol_table_factory = ASM::ELF::SymbolTable + @object_file_factory = ASM::ELF::ObjectFile + end + + def wire_macho + require 'compiler/asm/macho/object_file' + require 'compiler/asm/macho/symbol_table' + + @symbol_table_factory = ASM::MachO::SymbolTable + @object_file_factory = ASM::MachO::ObjectFile + end + + def wire_arm + require 'compiler/asm/arm/binary_assembler' + require 'compiler/asm/arm/text_assembler' + + @arch = ASM::ARM::Arch.instance + @asm = + case format + when 'text' + ASM::ARM::TextAssembler.new(self) + + when 'bin' + ASM::ARM::BinaryAssembler.new(self) + + else + raise "unsupported output format: 
#{format}" + end + end + + def wire_x86 + require 'compiler/asm/x86/binary_assembler' + require 'compiler/asm/x86/text_assembler' + + @arch = ASM::X86::Arch.instance + @asm = + case format + when 'text' + ASM::X86::TextAssembler.new(self) + + when 'bin' + ASM::X86::BinaryAssembler.new(self) + + else + raise "unsupported output format: #{format}" + end + end + +end diff --git a/lib/compiler/asm/arch.rb b/lib/compiler/asm/arch.rb new file mode 100644 index 0000000..bde51b4 --- /dev/null +++ b/lib/compiler/asm/arch.rb @@ -0,0 +1,67 @@ +class Compiler + module ASM + + class Arch + + attr_reader :bits, :word_bits + attr_reader :preamble, :postamble + attr_reader :endianness + + # config: + # - bits: native register / pointer size + # - word_bits: number of bits in a word + # - endianness: "big" or "little" + # - preamble: binary preamble + # - postamble: binary postamble + def initialize(config) + @bits = config['bits'] + @word_bits = config['word_bits'] + @endianness = config['endianness'] + @preamble = config['preamble'] + @postamble = config['postamble'] + end + + def bytes + bits / 8 + end + + def word_bytes + word_bits / 8 + end + + def big_endian? + endianness == 'big' + end + + def little_endian? + endianness == 'little' + end + + def pointer_bytes + bytes + end + + def min_signed + -1 * 2 ** (bits - 1) + end + + def max_signed + 2 ** (bits - 1) - 1 + end + + def min_unsigned + 0 + end + + def max_unsigned + 2 ** bits - 1 + end + + def signed_int + @signed_int ||= min_signed..max_signed + end + + end + + end +end diff --git a/lib/compiler/asm/assembler.rb b/lib/compiler/asm/assembler.rb new file mode 100644 index 0000000..12c5ee9 --- /dev/null +++ b/lib/compiler/asm/assembler.rb @@ -0,0 +1,82 @@ +# sjs +# may 2009 + +class Compiler + module ASM + + # Abstract class for common functionality between different code + # generators. Also defines somewhat of an interface that must be + # implemented to be useful. 
+ class Assembler + + def initialize(delegate) + @delegate = delegate + end + + def arch + delegate.arch + end + + def block(*args, &block) + instance_eval(&block) + end + + def load(n) + end + + def load_var(name) + end + + def store_var(name, reg) + end + + def neg(reg) + end + + def stack_add(reg) + end + + def stack_sub(reg) + end + + def stack_mul_signed(reg) + end + + def stack_div(reg) + end + + def stack_or(reg) + end + + def stack_xor(reg) + end + + def stack_and(reg) + end + + def not_(reg) + end + alias_method :not, :not_ + + def compare(reg, n) + end + + def je(label) + end + + def jne(label) + end + + def jmp(label) + end + + def mov_reg_imm(reg, n) + end + + def call(label) + end + + end + + end +end diff --git a/lib/compiler/asm/binary_assembler.rb b/lib/compiler/asm/binary_assembler.rb new file mode 100644 index 0000000..0f30cb7 --- /dev/null +++ b/lib/compiler/asm/binary_assembler.rb @@ -0,0 +1,322 @@ +require 'compiler/asm/assembler' +require 'compiler/asm/constant_proxy' +require 'compiler/asm/variable_proxy' + +class Compiler + module ASM + + class BinaryAssembler < Assembler + + DEBUG_OUTPUT = false + + attr_reader :ip + + def initialize(delegate) + super(delegate) + + @symtab = delegate.symbol_table_factory.new + + # Almost a byte array, except for addresses. + # + # Addresses take the form [:, ] + # where is one of: var, const, or label + # + # NOTE the type is redundant because of VariableProxy#const? + # and labels are just strings. + # + # however, we could accept strings for variable names + # if we keep the type tag. something to think about. + @ir = [] + + # Our instruction pointer, or the number of bytes written. + @ip = 0 + + # Map locations in the byte array to var proxies so we can + # resolve address operations on the 2nd pass. 
+ @proxies = {} + + emit_entry_point + emit_preamble + end + + # register for return values + def return_reg + raise 'subclasses must override #return_reg' + end + + def emit_entry_point + end + + def emit_preamble + arch.preamble[delegate.platform].each { |byte| emit_byte(byte) } + end + + def emit_postamble + arch.postamble[delegate.platform].each { |byte| emit_byte(byte) } + end + + def output + emit_postamble + + byte_array = resolve_labels + + #puts "1st pass: " + byte_array.inspect if DEBUG_OUTPUT + + binary = package(byte_array) + + @symtab.calculate_offsets(binary.length) + if DEBUG_OUTPUT + puts ">>> text offset: 0x#{@symtab.text_offset.to_s(16)}" + puts ">>> const offset: 0x#{@symtab.const_offset.to_s(16)}" + puts ">>> bss offset: 0x#{@symtab.bss_offset.to_s(16)}" + end + + # Now that we know where everything lies do the 2nd pass + # calculating and filling in final var and const addresses. + # + # outline: + # - resolve all variable proxies in @proxies replacing + # the placeholder bytes (0xff) with the real address + + bss_offset = @symtab.bss_offset + const_offset = @symtab.const_offset + @proxies.each do |i, proxy| + #puts ">>> Resolving #{proxy.name}" if DEBUG_OUTPUT + var = @symtab.var(proxy.name) + base_addr = if proxy.const? 
+ const_offset + @symtab.const(proxy.name) + else + bss_offset + @symtab.var(proxy.name) + end + #puts ">>> Replacing #{byte_array[i,4].map{|x|'0x' + x.to_s(16)}.inspect} with #{num_to_quad(proxy.resolve(base_addr)).map{|x|'0x' + x.to_s(16)}.inspect}" if DEBUG_OUTPUT + byte_array[i, arch.pointer_size] = num_to_quad(proxy.resolve(base_addr)) + end + + binary = package(byte_array) + + #puts "2nd pass: " + byte_array.inspect if DEBUG_OUTPUT + + objwriter = delegate.object_file_factory.new + objwriter.text(binary) + objwriter.const(@symtab.const_data) if @symtab.const_size > 0 + objwriter.bss(@symtab.bss_size) if @symtab.bss_size > 0 + objwriter.reloc(@symtab.reloc_info) + objwriter.symtab(@symtab) + objwriter.serialize + end + + def resolve_labels + bytes_read = 0 + bytes = [] + @ir.each_with_index do |x, i| + if x.is_a?(Numeric) + bytes << x + bytes_read += 1 + + elsif addr?(x) + # remember this so we can replace the bogus addr later + @proxies[bytes_read] = x[1] + + # add a relocation entry for this address + @symtab.reloc(bytes_read) + + # fill in said bogus addr + bogus_addr = [0xff] * arch.pointer_size + bytes += bogus_addr + bytes_read += bogus_addr.length + + + # TODO find out if we should calculate addrs as offsets rather than + # absolute as they are done now. (ok for Mach-O, maybe not ELF) + elsif label?(x) + # the actual eip points to the next instruction already, so should we. 
+ real_ip = bytes_read + arch.bytes + name = x[1] + addr = @symtab.lookup_label(name) - real_ip # dest - src to get relative addr + #puts "resolved label: #{x} = 0x#{@symtab.lookup_label(name).to_s(16)} (rel: 0x#{addr.to_s(16)}, ip = 0x#{real_ip.to_s(16)}, bytes_read = 0x#{bytes_read.to_s(16)})" if DEBUG_OUTPUT + + + addr_bytes = addr_to_bytes(addr) + bytes += addr_bytes + bytes_read += addr_bytes.length + + else + raise "unknown value in the IR at #{bytes_read} - #{x.inspect}" + end + end + + return bytes + end + + def package(bytes) + bytes.pack('c*') + end + + # Silly semantics, but labels don't count as an address since they + # don't need to be deferred. + def addr?(x) + x.is_a?(Array) && [:var, :const].include?(x[0]) + end + + def label?(x) + x.is_a?(Array) && x[0] == :label + end + + # XXX this should probably evaluate the value somehow + def define_const(name, bytes, value) + @symtab.define_const(name, bytes, value) + return const(name) + end + + # Define a variable with the given name and size in bytes. + def define_var(name, bytes = arch.word_bytes) + unless @symtab.var?(name) + @symtab.define_var(name, bytes) + else + STDERR.puts "[warning] attempted to redefine #{name}" + end + return var(name) + end + + def var(name) + STDERR.puts "[error] undefined variable #{name}" unless var?(name) + VariableProxy.new(name) + end + + def const(name) + STDERR.puts "[error] undefined constant #{name}" unless const?(name) + ConstantProxy.new(name) + end + + def var?(name) + @symtab.var?(name) + end + + def const?(name) + @symtab.const?(name) + end + + # Define a variable unless it exists. + def var!(name, bytes = arch.word_bytes) + if var?(name) + var(name) + else + define_var(name, bytes) + end + end + + # Count the bytes that were encoded in the given block. 
+ def asm + # stash the current number of bytes written + instruction_offset = @ip + + print "0x#{@ip.to_s(16).rjust(4, '0')}\t" if DEBUG_OUTPUT + + yield + + # return the number of bytes written + @ip - instruction_offset + + puts if DEBUG_OUTPUT + end + + + def emit_byte(byte) + + ##### The joke's on me! Array#pack('c*') already does this. It is nice to see + # in the debugging output though, so it stays for now. + # + # Convert negative native ints into signed bytes. + # + # Calculate the signed byte as the difference between -1 (0xff) and some + # number, X. When byte == -1 we want X == 0, so X == -byte - 1. + # Since -byte == ~byte + 1, then -byte - 1 == ~byte + 1 - 1 == ~byte, + # and X == ~byte. We want the *signed byte* -1, so we use 0xff, + # *not* -1. Ruby sees our signed bytes as positive ints 0-255. + # + byte = 0xff - ~byte if byte < 0 && byte >= -128 + + # make sure it's a byte + raise "not a byte: #{byte.inspect}" unless byte == byte & 0xff + + byte = byte & 0xff + ### end of pointless code + + print (byte >= 0 && byte < 0x10 ? '0' : '') + byte.to_s(16) + ' ' if DEBUG_OUTPUT + + @ir << byte + @ip += 1 + end + + # addresses are emited as arrays of bytes, prefixed with :var, :const, or :label + def emit_addr(type, name) + placeholder = [type, name] + puts placeholder.inspect if DEBUG_OUTPUT + @ir << placeholder + + # addresses are a constant size + @ip += arch.pointer_bytes + end + + def emit_var(name_or_proxy) + proxy = name_or_proxy.is_a?(VariableProxy) ? name_or_proxy : var(name_or_proxy) + emit_addr(:var, proxy) + end + + def emit_const(name) + proxy = name_or_proxy.is_a?(VariableProxy) ? 
name_or_proxy : const(name_or_proxy) + emit_addr(:const, proxy) + end + + def emit_label(name) + print "<#{name}> " if DEBUG_OUTPUT + emit_addr(:label, name) + end + + def make_label(suffix = nil) + @symtab.unique_label(suffix) + end + + def define_label(name) + puts "\n#{name} (0x#{@ip.to_s(16)}):" if DEBUG_OUTPUT + @symtab.define_label(name, @ip) + end + + def addr_to_bytes + if big_endian? + num_to_big_endian + elsif little_endian? + num_to_little_endian + else + raise 'oops' + end + end + + # Convert a number to an array of bytes, discarding excess bits. + def num_to_big_endian(num) + case arch.pointer_size + when 4 + [ + # high + (num >> 16) & 0xff, + (num >> 24) & 0xff, + + # low + num & 0xff, + (num >> 8) & 0xff + ] + else + raise 'unimplemented' + end + end + + # Convert a number to an array of bytes, discarding excess bits. + def num_to_little_endian(num) + bytes = num_to_big_endian + bytes.each_slice(2).to_a.reverse.flatten + end + + end + + end +end diff --git a/lib/compiler/asm/constant_proxy.rb b/lib/compiler/asm/constant_proxy.rb new file mode 100644 index 0000000..53c5127 --- /dev/null +++ b/lib/compiler/asm/constant_proxy.rb @@ -0,0 +1,13 @@ +class Compiler + module ASM + + class ConstantProxy < VariableProxy + + def const? + true + end + + end + + end +end diff --git a/lib/compiler/asm/cstruct.rb b/lib/compiler/asm/cstruct.rb new file mode 100644 index 0000000..6b2b5d1 --- /dev/null +++ b/lib/compiler/asm/cstruct.rb @@ -0,0 +1,342 @@ +# Struct does some trickery with custom allocators so we can't +# subclass it without writing C. Instead we define a CStruct class +# that does something similar enough for our purpose. It is +# subclassed just like any other class. A nice side-effect of this +# syntax is that it is always clear that a CStruct is just a class and +# instances of the struct are objects. 
+# +# Some light metaprogramming is used to make the following syntax possible: +# +# class MachHeader < CStruct +# uint :magic +# int :cputype +# int :cpusubtype +# ... +# int :flags +# end +# +# Inheritance works as you would expect. +# +# class LoadCommand < CStruct +# uint32 :cmd +# uint32 :cmdsize +# end +# +# # inherits cmd and cmdsize as the first 2 fields +# class SegmentCommand < LoadCommand +# string :segname, 16 +# uint32 :vmaddr +# uint32 +# end +# +# Nothing tricky or confusing there. Members of a CStruct class are +# declared in the class definition. A different definition using a +# more static approach probably wouldn't be very hard... if +# performance is critical ... but then why are you using Ruby? ;-) +# +# +# TODO support bit fields +# +# Bit fields should be supported by passing the number of bits a field +# should occupy. Perhaps we could use the size 'pack' for the rest of +# the field. +# +# class RelocationInfo < CStruct +# int32 :address +# uint32 :symbolnum, 24 +# pack :pcrel, 1 +# pack :length, 2 +# pack :extern, 1 +# pack :type, 4 +# end + +class CStruct + + + ################### + # Class Constants # + ################### + + # Size in bytes. + SIZE_MAP = { + :int8 => 1, + :uint8 => 1, + :int16 => 2, + :uint16 => 2, + :int32 => 4, + :uint32 => 4, + :string => lambda { |*opts| opts.first }, # first opt is size + # the last 3 are to make the language more C-like + :int => 4, + :uint => 4, + :char => 1 + } + + # 32-bit + PACK_MAP = { + :int8 => 'c', + :uint8 => 'C', + :int16 => 's', + :uint16 => 'S', + :int32 => 'i', + :uint32 => 'I', + :string => lambda do |str, *opts| + len = opts.first + str.ljust(len, "\0")[0, len] + end, + # a few C-like names + :int => 'i', + :uint => 'I', + :char => 'C' + } + + # Only needed when unpacking is different from packing, i.e. strings w/ lambdas in PACK_MAP. 
+ UNPACK_MAP = { + :string => lambda do |str, *opts| + len = opts.first + val = str[0, len-1].sub(/\0*$/, '') + str.slice!((len-1)..-1) + val + end + } + + ########################## + # Class Instance Methods # + ########################## + + # Note: const_get and const_set are used so the constants are bound + # at runtime, to the real class that has subclassed CStruct. + # I figured Ruby would do this but I haven't looked at the + # implementation of constants so it might be tricky. + # + # All of this could probably be avoided with Ruby 1.9 and + # private class variables. That is definitely something to + # experiment with. + + class <, +# , and . + +class Compiler + module MachO + + class LoadCommand < CStruct + uint32 :cmd + uint32 :cmdsize + end + + # Values for the cmd member of LoadCommand CStructs (incomplete!). + LC_SEGMENT = 0x1 + LC_SYMTAB = 0x2 + LC_SYMSEG = 0x3 + LC_THREAD = 0x4 + LC_UNIXTHREAD = 0x5 + + class SegmentCommand < LoadCommand + string :segname, 16 + uint32 :vmaddr + uint32 :vmsize + uint32 :fileoff + uint32 :filesize + int32 :maxprot + int32 :initprot + uint32 :nsects + uint32 :flags + end + + + # Values for protection fields, maxprot and initprot. + VM_PROT_NONE = 0x00 + VM_PROT_READ = 0x01 + VM_PROT_WRITE = 0x02 + VM_PROT_EXECUTE = 0x04 + VM_PROT_NO_CHANGE = 0x08 + VM_PROT_COPY = 0x10 + + + class SymbolTableCommand < LoadCommand + uint32 :symoff # Points to an array of Nlist structs. + uint32 :nsyms # Number of entries in said array. + uint32 :stroff # Offset of the string table. + uint32 :strsize # Size of the string table in bytes. 
+ end + + + LOAD_COMMAND_STRUCT_MAP = { + LC_SEGMENT => SegmentCommand, + LC_SYMTAB => SymbolTableCommand + } + + end +end diff --git a/lib/compiler/asm/macho/mach_header.rb b/lib/compiler/asm/macho/mach_header.rb new file mode 100644 index 0000000..72100b6 --- /dev/null +++ b/lib/compiler/asm/macho/mach_header.rb @@ -0,0 +1,46 @@ +require 'compiler/cstruct' + +# The MachO module contains constants and structures related to the +# Mach Object format (Mach-O). They are relevant to Darwin on OS X. +# +# Constants and structures as defined in /usr/include/mach-o/loader.h +# on Mac OS X Leopard (10.5.7). Also see , +# , and . + +class Compiler + module MachO + + # Appears at the beginning of every Mach object file. + class MachHeader < CStruct + uint32 :magic + int32 :cputype + int32 :cpusubtype + uint32 :filetype + uint32 :ncmds + uint32 :sizeofcmds + uint32 :flags + end + + # Values for the magic field. + MH_MAGIC = 0xfeedface # Mach magic number (big-endian). + MH_CIGAM = 0xcefaedfe # Little-endian version. + + # Values for the filetype field. + MH_OBJECT = 0x1 + MH_EXECUTE = 0x2 + MH_FVMLIB = 0x3 + MH_CORE = 0x4 + MH_PRELOAD = 0x5 + MH_DYLIB = 0x6 + MH_DYLINKER = 0x7 + MH_BUNDLE = 0x8 + MH_DYLIB_STUB = 0x9 + MH_DSYM = 0xa + + # CPU types and subtypes (only Intel for now). + CPU_TYPE_X86 = 7 + CPU_TYPE_I386 = CPU_TYPE_X86 + CPU_SUBTYPE_X86_ALL = 3 + + end +end diff --git a/lib/compiler/asm/macho/nlist.rb b/lib/compiler/asm/macho/nlist.rb new file mode 100644 index 0000000..58fd0e8 --- /dev/null +++ b/lib/compiler/asm/macho/nlist.rb @@ -0,0 +1,50 @@ +require 'compiler/cstruct' + +# The MachO module contains constants and structures related to the +# Mach Object format (Mach-O). They are relevant to Darwin on OS X. +# +# Constants and structures as defined in /usr/include/mach-o/loader.h +# on Mac OS X Leopard (10.5.7). Also see , +# , and . 
+ +class Compiler + module MachO + + ######################## + # Symbol table support # + ######################## + + # Nlist is used to describe symbols. + class Nlist < CStruct + uint32 :n_strx # Index into string table. Index of zero is the empty string. + uint8 :n_type # Type flag (see below). + uint8 :n_sect # Section number (from 1) or NO_SECT. + uint16 :n_desc # TODO See . + uint32 :n_value # The symbol's value (or stab offset). + end + + # Type flag (see for more details) + # --------- + # + # This field consists of four bitfields: + # + # uchar N_STAB : 3 + # uchar N_PEXT : 1 + # uchar N_TYPE : 3 + # uchar N_EXT : 1 + # + N_STAB = 0xe0 # if any bits set => symbolic debugging info + N_PEXT = 0x10 # private external symbol bit + N_TYPE = 0x0e # mask for the type bits + N_EXT = 0x01 # external symbol bit, set for external symbols (e.g. globals) + + # Values for N_TYPE. (incomplete!) + N_UNDF = 0x0 # undefined, n_sect == NO_SECT + N_ABS = 0x2 # absolute, n_sect == NO_SECT + N_SECT = 0xe # defined in section number n_sect + + NO_SECT = 0 + MAX_SECT = 255 + + end +end diff --git a/lib/compiler/asm/macho/object_file.rb b/lib/compiler/asm/macho/object_file.rb new file mode 100644 index 0000000..c3e99be --- /dev/null +++ b/lib/compiler/asm/macho/object_file.rb @@ -0,0 +1,373 @@ +require 'asm/macho' + +class Compiler + module MachO + + class ObjectFile + + attr_accessor :header, :load_commands, :sections, :data + attr_accessor :current_segment + + def initialize(filetype = MH_OBJECT) + @header = MachHeader.new(MH_MAGIC, CPU_TYPE_X86, CPU_SUBTYPE_X86_ALL, filetype, 0, 0, 0) + @load_commands = [] # All defined segments. + @sections = {} # Map of segment names to lists of sections. + @section_disk_size = Hash.new(0) # Sections store their VM size so we need their sizes on disk. + @section_offset = 0 # Offset of the next section's data, in bytes. + @data = [] # Blobs of data that appear at the end of the file. + # (text, data, relocation info, symtab, ...) 
+ @current_segment = nil # An alias for the last defined segment. + @text_segname = nil # Name of __TEXT segement + @text_sect_index = nil # Index of __text section + @text_data_index = nil # Index into @data of __text section data + @reloc_info = nil # Copy of relocation info array + end + + + # Define a LoadCommand in this file. The header's ncmds and sizeofcmds + # fields are updated automatically to keep things in sync. If a block is + # given it is passed the new LoadCommand struct after all other + # initialization has been done. + # + # Other methods that create any type of load command should use this + # method to do so. Right now the only types supported are LC_SEGMENT + # and LC_SYMTAB. Modify asm/macho.rb to add structs for other types, and + # add them to LOAD_COMMAND_STRUCT_MAP. + + def load_command(cmdtype) + struct = LOAD_COMMAND_STRUCT_MAP[cmdtype] + unless struct + raise "unsupported load command type: #{cmdtype.inspect}," + + " supported types: #{LOAD_COMMAND_STRUCT_MAP.keys.sort.inspect}" + end + + # Fill in all the unknown fields with 0, this is nonsense for + # string fields but that doesn't really matter. + dummy_vals = [0] * (struct::Members.size - 2) + + # cmd cmdsize ... + command = struct.new(cmdtype, struct.bytesize, *dummy_vals) + + @load_commands << command + + @header[:ncmds] += 1 + @header[:sizeofcmds] += command.bytesize + + yield(command) if block_given? + + return command + end + + + # Define a segment in this file. If a block is given it is passed + # the new segment. You can chain calls to segment, it returns self. + # + # Mach object files should only contain one anonymous segment. This + # is not checked but should be kept in mind when crafting files. + def segment(name, &block) + @current_segment = load_command(LC_SEGMENT) do |seg| + seg[:segname] = name + block.call(seg) if block + end + return self + end + + + # Define a section under the given segment. nsects and cmdsize are + # updated automatically. 
segname can't be derived from the segment + # that this section is defined under, as they can differ. + # + # Mach object files have the __text, __data, and other common + # sections all defined under one anonymous segment, but their segment + # names reflect their final positions after linking. The linker plonks + # them in the segment that they name. + def section(name, segname, data = '', vmsize=data.size, + segment = @current_segment, type = S_REGULAR) + + # Create the new section. + section = Section.new(name, segname, @section_offset, vmsize, 0, 0, 0, 0, 0, 0, type) + + # Add this section to the map of segment names to sections. + (@sections[segment[:segname]] ||= []) << section + @section_disk_size[name] = data.size + @section_offset += data.size + @data << data if data.size > 0 + + # Update the header. + @header[:sizeofcmds] += section.bytesize + + # Update the segment. + segment[:nsects] += 1 + segment[:cmdsize] += section.bytesize + + yield(section) if block_given? + + return section + end + + + + # Define a standard text section under the current segment (if present). + # + # If there is no current segment then we act according to the file's type + # (specified in the header). Segments are created if they do not exist. + # + # When it is MH_OBJECT the text section is defined under a single, + # nameless segment, but the section's segment name is set to the name + # given here. + # + # For MH_EXECUTE files the text section goes under the segment with the + # name given (__TEXT). 
+ + def text(data, sectname = '__text', segname='__TEXT') + real_segname = nil + unless @current_segment + real_segname = segname_based_on_filetype(segname) + segment(real_segname) do |seg| + seg[:maxprot] = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE + seg[:initprot] = VM_PROT_READ | VM_PROT_EXECUTE + end + end + + section(sectname, segname, data) do |sect| + # reloff and nreloc are calculated later (in calculate_offsets) + sect[:flags] = 0x400 # S_ATTR_SOME_INSTRUCTIONS + end + + # Remember where section and data are so we can update them later. + @text_segname = real_segname || segname + @text_sect_index = @sections[@text_segname].length-1 + @text_data_index = @data.length-1 + + return self + end + + def update_text(data) + raise 'no __text segment defined yet' unless @text_data_index + @data[@text_data_index] = data + end + + # Basis for #data, #const, and #bss methods. + def segment_based_on_filetype(segname, options = {}) + unless @current_segment + permissions = VM_PROT_READ + permisions |= VM_PROT_WRITE if options.delete(:writable) + segment(segname_based_on_filetype(segname)) do |seg| + seg[:initprot] = seg[:maxprot] = permissions + end + end + yield if block_given? + return self + end + + # Define a standard data section under the current segment (if present). + # This behaves similarly to the text method. + # + def data(data, sectname = '__data', segname='__DATA') + segment_based_on_filetype(segname, :writable => true) do + section(sectname, segname, data) + end + end + + # Define a standard const section under the current segment (if present). + # This behaves similarly to the data method. + # + def const(data, sectname = '__const', segname='__DATA') + segment_based_on_filetype(segname) do + section(sectname, segname, data) + end + end + + # Define a standard BSS section under the current segment (if present). 
+ # This behaves similarly to the data method but accepts a VM size instead + # of a blob, and no data is written to file since this section is for + # uninitialized data. + # + def bss(vmsize, sectname = '__bss', segname='__DATA') + segment_based_on_filetype(segname, :writable => true) do + section(sectname, segname, '', vmsize) + end + end + + # Define a relocation table. Usually between segments and the + # symbol table. + # + # Accepts an array of relocation info structs. + def reloc(reloc_info) + @data << if reloc_info.respond_to?(:join) + reloc_info.map {|r| r.serialize}.join + else + reloc_info + end + @reloc_info = reloc_info.map {|x| x.clone} + return self + end + + # Define a symbol table. This should usually be placed at the end of the + # file. + # + # This function is overloaded to accept either an array of Nlist structs + # packed into a byte string (i.e. a C array) and a string table, or a + # single parameter: any type of SymbolTable. + + def symtab(nlist_ary_or_symtab, stab = nil) + if stab.nil? + symtab = nlist_ary_or_symtab + stab = symtab.stab + nlist_ary = symtab.nlist_ary + else + nlist_ary = nlist_ary_or_symtab + end + + load_command(LC_SYMTAB) do |st| + st[:nsyms] = nlist_ary.size + st[:strsize] = stab.size + # symoff and stroff are filled in when offsets are recalculated. + end + + # puts ">>> Defining symbol table:" + # puts ">>> #{nlist_ary.size} symbols" + # puts ">>> stab = #{stab.inspect}" + # puts ">>> nlist_ary = #{nlist_ary.inspect}" + # puts ">>> (serialized) = #{nlist_ary.map{|n|n.serialize}.join.inspect}" + + @data << nlist_ary.map {|n| n.serialize}.join + @data << stab + return self + end + + + # Serialize the entire MachO file into a byte string. This is simple + # thanks to CStruct#serialize. + + def serialize + # TODO sanity checks, e.g. assert(@header[:ncmds] == @load_command.size) + # ... perhaps an option to recalculate such data as well. 
+ + # Now that we have all the pieces of the file defined we can calculate + # the file offsets of segments and sections. + calculate_offsets + + ################################### + # Mach-O file Part 1: Mach Header # + ################################### + @header.serialize + + + ##################################### + # Mach-O file Part 2: Load Commands # + ##################################### + # dump each load command (which include the section headers under them) + @load_commands.map do |cmd| + sects = @sections[cmd[:segname]] rescue [] + sects.inject(cmd.serialize) do |data, sect| + data + sect.serialize + end + end.join + + + ################################### + # Mach-O file Part 3: Binary data # + ################################### + @data.join + end + + + # Update the file offsets in segments and sections. + + def calculate_offsets + + # Maintain the offset into the the file on disk. This is used + # to update the various structures. + offset = @header.bytesize + + # First pass over load commands. Most sizes are filled in here. + @load_commands.each do |cmd| + case cmd[:cmd] + + when LC_SEGMENT + seg = cmd + sections = @sections[seg[:segname]] + section_size = sections.size * Section.bytesize + section_vm_size = sections.inject(0) { |total, sect| total + sect[:size] } + section_disk_size = sections.inject(0) do |total, sect| + total + @section_disk_size[sect[:sectname]] + end + + ### TODO this should be redundant. try commenting it out one day. + seg[:nsects] = sections.size + seg[:cmdsize] = seg.bytesize + section_size + ### + + seg[:vmsize] = section_vm_size + seg[:filesize] = section_disk_size + + when LC_SYMTAB + # nop + + else + raise "unsupported load command: #{cmd.inspect}" + end + + offset += cmd[:cmdsize] + end + + + # offset now points to the end of the Mach-O headers, or the beginning + # of the binary blobs of section data at the end. + + # Second pass over load commands. Fill in file offsets. 
+ @load_commands.each do |cmd| + case cmd[:cmd] + + when LC_SEGMENT + seg = cmd + sections = @sections[seg[:segname]] + seg[:fileoff] = offset + sections.each do |sect| + sect[:offset] = offset + offset += @section_disk_size[sect[:sectname]] + end + + when LC_SYMTAB + if @reloc_info + # update text section with relocation info + __text = @sections[@text_segname][@text_sect_index] + __text[:reloff] = offset + __text[:nreloc] = @reloc_info.length + offset += @reloc_info.first.bytesize * @reloc_info.length + end + st = cmd + st[:symoff] = offset + offset += st[:nsyms] * Nlist.bytesize + st[:stroff] = offset + offset += st[:strsize] + + + # No else clause is necessary, the first iteration should have caught them. + + end + + end # @load_commands.each + + end # def calculate_offsets + + + ####### + private + ####### + + def segname_based_on_filetype(segname) + case @header[:filetype] + when MH_OBJECT + '' + when MH_EXECUTE + segname + else + raise "unsupported MachO file type: #{@header.inspect}" + end + end + + + end + + end +end diff --git a/lib/compiler/asm/macho/relocation_info.rb b/lib/compiler/asm/macho/relocation_info.rb new file mode 100644 index 0000000..afb4725 --- /dev/null +++ b/lib/compiler/asm/macho/relocation_info.rb @@ -0,0 +1,35 @@ +require 'compiler/cstruct' + +# The MachO module contains constants and structures related to the +# Mach Object format (Mach-O). They are relevant to Darwin on OS X. +# +# Constants and structures as defined in /usr/include/mach-o/loader.h +# on Mac OS X Leopard (10.5.7). Also see , +# , and . + +class Compiler + module MachO + + class RelocationInfo < CStruct + int32 :r_address # offset in the section to what is being relocated + uint32 :r_info + end + + # NOTE: r_info is a packed bit field with the following members: + # + # (CStruct should eventually support bitfields, but doesn't right now.) 
+ # + # r_symbolnum : 24 -- symbol index if r_extern == 1 or section ordinal if r_extern == 0 + # r_pcrel : 1 -- was relocated pc relative already + # r_length : 2 -- 0=byte, 1=word, 2=long, 3=quad + # r_extern : 1 -- 1 for exported symbols, 0 othewise + # r_type : 4 -- if not 0, machine specific relocation type (always 0) + + R_ABS = 0 # Absolute relocation type + # (r_symbolnum == R_ABS for absolute symbols that don't need reloc) + + # Relocation types (r_type) + GENERIC_RELOC_VANILLA = 0 + + end +end diff --git a/lib/compiler/asm/macho/section.rb b/lib/compiler/asm/macho/section.rb new file mode 100644 index 0000000..fb29519 --- /dev/null +++ b/lib/compiler/asm/macho/section.rb @@ -0,0 +1,34 @@ +require 'compiler/cstruct' + +# The MachO module contains constants and structures related to the +# Mach Object format (Mach-O). They are relevant to Darwin on OS X. +# +# Constants and structures as defined in /usr/include/mach-o/loader.h +# on Mac OS X Leopard (10.5.7). Also see , +# , and . + +class Compiler + module MachO + + class Section < CStruct + string :sectname, 16 + string :segname, 16 + uint32 :addr + uint32 :size + uint32 :offset + uint32 :align + uint32 :reloff + uint32 :nreloc + uint32 :flags + uint32 :reserved1 + uint32 :reserved2 + end + + # Values for the type bitfield (mask 0x000000ff) of the flags field. + # (incomplete!) + S_REGULAR = 0x0 + S_ZEROFILL = 0x1 + S_CSTRING_LITERALS = 0x2 + + end +end diff --git a/lib/compiler/asm/macho/structs.rb b/lib/compiler/asm/macho/structs.rb new file mode 100644 index 0000000..65130b5 --- /dev/null +++ b/lib/compiler/asm/macho/structs.rb @@ -0,0 +1,53 @@ +require 'compiler/macho/mach_header' +require 'compiler/macho/load_commands' +require 'compiler/macho/section' +require 'compiler/macho/relocation_info' + +# The MachO module contains constants and structures related to the +# Mach Object format (Mach-O). They are relevant to Darwin on OS X. 
+# +# Constants and structures as defined in /usr/include/mach-o/loader.h +# on Mac OS X Leopard (10.5.7). Also see , +# , and . + +class Compiler + module MachO + + ######################## + # Symbol table support # + ######################## + + # Nlist is used to describe symbols. + class Nlist < CStruct + uint32 :n_strx # Index into string table. Index of zero is the empty string. + uint8 :n_type # Type flag (see below). + uint8 :n_sect # Section number (from 1) or NO_SECT. + uint16 :n_desc # TODO See . + uint32 :n_value # The symbol's value (or stab offset). + end + + # Type flag (see for more details) + # --------- + # + # This field consists of four bitfields: + # + # uchar N_STAB : 3 + # uchar N_PEXT : 1 + # uchar N_TYPE : 3 + # uchar N_EXT : 1 + # + N_STAB = 0xe0 # if any bits set => symbolic debugging info + N_PEXT = 0x10 # private external symbol bit + N_TYPE = 0x0e # mask for the type bits + N_EXT = 0x01 # external symbol bit, set for external symbols (e.g. globals) + + # Values for N_TYPE. (incomplete!) 
+ N_UNDF = 0x0 # undefined, n_sect == NO_SECT + N_ABS = 0x2 # absolute, n_sect == NO_SECT + N_SECT = 0xe # defined in section number n_sect + + NO_SECT = 0 + MAX_SECT = 255 + + end +end diff --git a/lib/compiler/asm/macho/symbol.rb b/lib/compiler/asm/macho/symbol.rb new file mode 100644 index 0000000..ab36f9c --- /dev/null +++ b/lib/compiler/asm/macho/symbol.rb @@ -0,0 +1,31 @@ +require 'compiler/macho' + +class Compiler + module MachO + + class Symbol + + attr_accessor :name, :type, :segnum, :desc, :value + + def initialize(name, type, segnum, desc, value) + @name = name + @type = type + @segnum = segnum + @desc = desc + @value = value + end + + + def to_nlist(strx) + Nlist.new(strx, @type, @segnum, @desc, @value) + end + + + def to_s + @name + end + + end + + end +end diff --git a/lib/compiler/asm/macho/symbol_table.rb b/lib/compiler/asm/macho/symbol_table.rb new file mode 100644 index 0000000..2562a42 --- /dev/null +++ b/lib/compiler/asm/macho/symbol_table.rb @@ -0,0 +1,88 @@ +require 'compiler/macho/structs' +require 'compiler/macho/symbol' +require 'compiler/asm/symbol_table' + +class Compiler + module MachO + + class SymbolTable < Assembler::SymbolTable + + def make_symbols(vars, base_addr, type, segnum) + # Note: Sorting a Ruby hash gives an alist, e.g. [[, ], ...] + # We can use map on it as if it were a hash so it works nicely. + vars.sort { |a,b| a[1] <=> b[1] }. + map do |name, offset| + Symbol.new(name, type, segnum, 0, base_addr + offset) + end + end + + def all_symbols + # TODO FIXME: + # - the last var exported ends up after main somewhere... WTF?! + # - All labels are exported. This should be changed and only functions exported! 
+ + section = 1 + + # Functions (section #1, __text) + symbols = make_symbols(@labels, text_offset, N_SECT | N_EXT, section) + section += 1 + + # Constants (section #2, __const) + if @consts.size > 0 + symbols += make_symbols(@consts, const_offset, N_SECT, section) + section += 1 + end + + # Variables (section #3, __bss) + if @vars.size > 0 + symbols += make_symbols(@vars, bss_offset, N_SECT, section) + end + + return symbols + end + + # this is fairly stupid but works + def bss_section + @consts.size > 0 ? 3 : 2 + end + + def nlist_ary + symbols = {} + strx = 1 + ary = [] + all_symbols.each do |sym| + key = sym.name.to_sym + unless symbols.has_key?(key) + symbols[key] = strx + strx += sym.name.length + 1 # +1 for the null byte + end + ary << sym.to_nlist(symbols[key]) + end + return ary + end + + def stab + # The empty strings result in a string that begins and ends with a null byte + ['', all_symbols, ''].flatten.map { |sym| sym.to_s }.join("\0") + end + + def reloc(r_address, r_symbolnum = 0, r_length = 2, r_extern = 0, r_pcrel = 0, r_type = 0) + r_info = (r_type << 28) | (r_extern << 27) | (r_length << 25) | + (r_pcrel << 24) | r_symbolnum + @reloc_info << RelocationInfo.new(r_address, r_info) + end + + def reloc_info + n = bss_section + @reloc_info.each {|r| r[:r_info] |= n} + end + + def calculate_offsets(text_size) + @const_offset = @text_offset + text_size + @bss_offset = @const_offset + @const_size + end + + end + + end +end diff --git a/lib/compiler/asm/object_file.rb b/lib/compiler/asm/object_file.rb new file mode 100644 index 0000000..c65c835 --- /dev/null +++ b/lib/compiler/asm/object_file.rb @@ -0,0 +1,28 @@ +class Compiler + module ASM + + class UnimplementedMethodError < RuntimeError; end + + + # Abstract base class. 
+    class ObjWriter
+      # Writes the result of #serialize to filename, opened in binary mode.
+      def write!(filename)
+        File.open(filename, 'wb') do |file|
+          file.print(serialize)
+        end
+      end
+      # Raises UnimplementedMethodError carrying the name of the missing method.
+      def fail(name)
+        raise UnimplementedMethodError.new(name)
+      end
+
+      # These methods must be defined for most uses of the library.
+      %w[header segment section text data bss symtab serialize].each do |name|
+        define_method(name) { |*| fail(name) } # accept any arguments so the stub error is the one raised
+      end
+
+    end
+
+  end
+end
diff --git a/lib/compiler/asm/register_proxy.rb b/lib/compiler/asm/register_proxy.rb
new file mode 100644
index 0000000..7c0fedc
--- /dev/null
+++ b/lib/compiler/asm/register_proxy.rb
@@ -0,0 +1,70 @@
+class Compiler
+  module ASM
+
+    # Acts like a register and can be used as the base or index in an
+    # effective address.
+    #
+    # e.g. [EAX] or [ESI+EBX] or [EAX + 0xff] or [EAX + EDX * 2]
+    class RegisterProxy
+
+      attr_reader :name, :size, :regnum
+      attr_reader :base, :index, :scale
+
+      # A freshly built proxy is a bare register: it is its own base.
+      def initialize(name, size, regnum)
+        @name = name # attrs are read-only so sharing is ok
+        @size = size
+        @regnum = regnum
+        @base = self
+      end
+
+      # Returns a copy of this register carrying the given index; self is untouched.
+      def +(index)
+        raise "index already specified" if @index
+        new_reg = self.clone
+        new_reg.instance_variable_set('@index', index)
+        new_reg
+      end
+
+      # Sets the scale of an indexed proxy. In Ruby write (EAX + EDX) * 2, since * binds before +.
+      def *(scale)
+        raise "index must come first" unless @index
+        raise "scale already specified" if @scale
+        raise "unsupported scale: #{scale}" unless scale.to_s.match(/^[1248]$/)
+        @scale = scale
+        self
+      end
+
+      # Truthy (the scale itself) once a scale has been set.
+      def scale?
+        @scale
+      end
+
+      # Truthy (the index itself) once an index has been set.
+      def index?
+        @index
+      end
+
+      # True for a bare register, i.e. one with neither index nor scale.
+      def register?
+        @scale.nil? && @index.nil?
+      end
+
+
+      # Renders the name followed by "+index" and "*scale" when they are set.
+      def to_s
+        [ @name.to_s,
+          @index && "+#{@index}",
+          @scale && "*#{@scale}"
+        ].compact.join
+      end
+
+      # Same text as #to_s.
+      def inspect
+        to_s
+      end
+
+    end
+
+  end
+end
diff --git a/lib/compiler/asm/symbol_table.rb b/lib/compiler/asm/symbol_table.rb
new file mode 100644
index 0000000..1ed54b1
--- /dev/null
+++ b/lib/compiler/asm/symbol_table.rb
@@ -0,0 +1,99 @@
+class Compiler
+  module ASM
+
+
+    # Abstract symbol table.
+ # + # Basically a big map of variable, constant, and label names to + # offsets within their respective sections. Final addresses are + # calculated from these offsets on the 2nd pass when we know where + # things will actually live in memory. + + class SymbolTable + + attr_accessor :text_offset, :bss_offset, :const_offset + attr_reader :const_data, :const_size, :bss_size, :reloc_info + + def initialize + @vars = {} # Map of variable names to offsets. (bss vars) + @consts = {} # Map of constant names to offsets. + @funcs = {} # map of function names to offsets. + + # Initial data to load into memory (data for __DATA segment). + @const_data = '' + + @const_size = 0 # Size of const section. + @bss_size = 0 # Size of bss section. + + # Map names to locations. + @labels = Hash.new {|h, key| raise "undefined label: #{key}"} + @num_labels = 0 # Used to generate unique labels. + @num_labels_with_suffix = Hash.new(0) + + # Relocation info. Subclasses should define a reloc method. + @reloc_info = [] + + @text_offset = 0 + @bss_offset = 0 + @const_offset = 0 + end + + # Generate a unique label. 
+ def unique_label(suffix = nil) + @num_labels += 1 + if suffix + @num_labels_with_suffix[suffix] += 1 + suffix = "_#{suffix}_#{@num_labels_with_suffix[suffix]}" + end + name = "L#{sprintf "%06d", @num_labels}#{suffix}" + return name + end + + def define_label(name, offset) + @labels[name] = offset + return name + end + + + def lookup_label(name) + @labels[name] + end + + + def define_var(name, bytes) + @vars[name] = @bss_size + @bss_size += bytes + end + + + def define_const(name, value, bytes) + @consts[name] = @const_size + @const_size += bytes + @const_data << [value].pack('i') + end + + def define_func(name, offset) + @funcs[name] = offset + end + + + def var(name) + @vars[name] + end + + def var?(name) + @vars.has_key?(name) + end + + def const(name) + @consts[name] + end + + def const?(name) + @consts.has_key?(name) + end + + end + + end +end diff --git a/lib/compiler/asm/text_assembler.rb b/lib/compiler/asm/text_assembler.rb new file mode 100644 index 0000000..6821d6c --- /dev/null +++ b/lib/compiler/asm/text_assembler.rb @@ -0,0 +1,73 @@ +# sjs +# may 2009 + +require 'compiler/asm/assembler' + +class Compiler + module ASM + + class TextAssembler < Assembler + + def initialize(delegate) + super(delegate) + + @vars = {} # Symbol table, maps names to locations in BSS. + @data = '' + @bss = '' + @code = '' + + unless File.readable?(template_filename) + raise "unsupported platform/arch: #{delegate.platform}/#{arch.name}" + end + end + + def template_filename + @template_filename ||= File.join(File.dirname(__FILE__), arch.name, "template.#{delegate.platform}.asm") + end + + # Define a constant + def const(name, value) + end + + # Define a variable with the given name and size in bytes. 
+ def define_var(name, bytes = arch.bytes) + unless var?(name) + define_var_impl(name, bytes) + else + STDERR.puts "[warning] attempted to redefine #{name}" + end + end + + def define_var_impl(name, bytes = arch.bytes) + end + + def var(name) + @vars[name] + end + alias_method :var?, :var + + + # Emit a line of code wrapped between a tab and a newline. + def emit(code, options = {}) + tab = options.has_key?(:tab) ? options[:tab] : "\t" + @code << "#{tab}#{code}\n" + end + + def label(name = nil) + # FIXME + name = super + @labels[name] = name + return name + end + + def output + end + + def emit_label(name = label) + emit("#{name}:", tab: nil) + end + + end + + end +end diff --git a/lib/compiler/asm/variable_proxy.rb b/lib/compiler/asm/variable_proxy.rb new file mode 100644 index 0000000..aa9d4f8 --- /dev/null +++ b/lib/compiler/asm/variable_proxy.rb @@ -0,0 +1,43 @@ +class Compiler + module ASM + + # Wrap a variable's address so that we can perform arithmetic on it + # before resolving it when we know where things will go in memory. + # All we do is catch arithmetic ops and then provide a means to + # resolve a final addres by replaying them later. + # + # e.g. [symtab.var('i')] or [symtab.var('i') * 2] + class VariableProxy + + attr_reader :name + attr_accessor :ops + + def initialize(name) + @name = name + @ops = [] + end + + %w[+ * / - % & |].each do |op| + define_method(op) do |*args| + new_proxy = self.class.new(@name) + new_proxy.ops << [op, *args] + return new_proxy + end + end + + # XXX should this perhaps use the offset instead? + def resolve(base_addr) + @ops.inject(base_addr) do |addr, op| + addr.send(*op) + end + end + + # Overriden by ConstantProxy + def const? 
+ false + end + + end + + end +end diff --git a/lib/compiler/asm/x86/arch.rb b/lib/compiler/asm/x86/arch.rb new file mode 100644 index 0000000..1c4f6a8 --- /dev/null +++ b/lib/compiler/asm/x86/arch.rb @@ -0,0 +1,42 @@ +require 'compiler/asm/arch' + +class Compiler + module ASM + module X86 + + module Arch + + BINARY_PREAMBLE = { + 'linux' => [], + + 'darwin' => [ 0x55, # push ebp + 0x89, 0xe5, # mov ebp, esp + 0x81, 0xec, 8, 0, 0, 0 # sub esp, 8 + ] + } + + BINARY_POSTAMBLE = { + 'linux' => [ 0x89, 0xc3, # mov ebx, eax (exit code) + 0xb8, 1, 0, 0, 0, # mov eax, 1 + 0xcd, 0x80 # int 0x80 + ], + + 'darwin' => [ 0xc9, # leave + 0xc3 # ret + ] + } + + def self.instance + @instance ||= ASM::Arch.new({ + 'bits' => 32, + 'word_bits' => 16, + 'preamble' => BINARY_PREAMBLE, + 'postamble' => BINARY_POSTAMBLE + }) + end + + end + + end + end +end diff --git a/lib/compiler/asm/x86/binary_assembler.rb b/lib/compiler/asm/x86/binary_assembler.rb new file mode 100644 index 0000000..cd2d50a --- /dev/null +++ b/lib/compiler/asm/x86/binary_assembler.rb @@ -0,0 +1,866 @@ +# A very basic x86 assembler library for Ruby. Generally the +# instructions implemented are the minimum needed by the compiler this +# is written for. x86 is just too big. +# +# sjs +# may 2009 +# +# Refer to the Intel[1] or AMD documentationon on x86 for explanations +# of Mod-R/M encoding, the Scale-Index-Base (SIB) byte, opcode groups. +# +# The start and exit shell codes were obtained by disassembling +# minimal binaries on the respective platforms. + +require 'json' +require 'compiler/asm/binary_assembler' +require 'compiler/asm/x86/arch' +require 'compiler/asm/x86/registers' + +class Compiler + module ASM + module X86 + + class BinaryAssembler < ASM::BinaryAssembler + + include Registers + + DEBUG_OUTPUT = false + + SIGNED_BYTE = -128..127 + + # This is used for encoding instructions. Just as the equivalent + # assembly would contain "BITS 32", binary is generated for 32-bit + # protected mode. 
+        DEFAULT_OPERAND_SIZE = :dword
+
+        SIZE_MAP = {
+          byte: 8,
+          word: 16,
+          dword: 32
+        }
+
+        def emit_entry_point
+          # Always include the _main entry point in our symbol table. It begins at the
+          # beginning of the __TEXT segment, 0x0.
+          define_label('_main')
+        end
+
+        # register for return values
+        def return_reg
+          EAX
+        end
+
+
+        ### Virtual ISA used by parser.
+
+        def load(n)
+          mov(return_reg, n)
+        end
+
+        def load_var(name)
+          mov(return_reg, [var(name)])
+        end
+        # Mirror of load_var: resolve the name with var(); a VariableProxy is used as is.
+        def store_var(name, reg)
+          mov([name.is_a?(VariableProxy) ? name : var(name)], reg)
+        end
+
+
+        # stack_* methods expect op1 on the stack
+        # and the other operand in reg; op1 is popped into EBX.
+        def stack_add(reg)
+          pop(EBX)
+          add(reg, EBX)
+        end
+
+        def stack_sub(reg)
+          pop(EBX)
+          sub(reg, EBX)
+        end
+
+        def stack_mul_signed(reg)
+          pop(EBX)
+          imul(EBX)
+        end
+
+        def stack_div(reg)
+          pop(EBX)                # Get op1
+          xchg(reg, EBX)          # Swap the divisor and dividend into
+                                  # the correct places.
+
+          # idiv uses edx:eax as the dividend so we need to ensure that edx
+          # is correctly sign-extended w.r.t. eax.
+          cdq                     # Sign-extend eax into edx (Convert Double to Quad).
+
+          idiv(EBX)               # Divide a (eax) by b (ebx).
+        end
+
+        def stack_or(reg)
+          pop(EBX)
+          self.or(reg, EBX)
+        end
+
+        def stack_xor(reg)
+          pop(EBX)
+          xor(reg, EBX)
+        end
+
+        def stack_and(reg)
+          pop(EBX)
+          self.and(reg, EBX)
+        end
+
+        def compare(reg, n)
+          cmp(reg, n)
+        end
+
+        def mov_reg_imm(reg, imm)
+          mov(reg, imm)
+        end
+
+
+        ############################
+        ### Instruction Encoding ###
+        ############################
+
+        def emit_dword(num)
+          num_to_quad(num).each { |byte| emit_byte(byte) }
+        end
+
+        def emit_modrm(addr, reg = 0)
+          mod = 0
+          rm = 0
+          disp8 = nil
+          disp32 = nil
+          sib = nil
+          var = nil               # variable proxy
+
+          # effective address
+          if addr.is_a?(Array)
+            eff_addr = addr[1] || addr[0]   # works with or without size prefix
+            raise "invalid effective address: #{addr.inspect}" unless eff_addr
+            case eff_addr
+
+            when RegisterProxy
+
+              # Simple register addressing, e.g. [ESI].
+              #
+              # mod == 00
+              if eff_addr.register?
+                mod = 0
+
+                # [ESP] and [EBP] can't be encoded directly.  The
+                # workaround is to use SIB to emit the code for [ESP+0]
+                # and [EBP+0] instead.
+                #
+                # To emit [ESP+0] we use SIB with scale=1 index=0 base=ESP.
+                if eff_addr == ESP
+                  rm = 4          # SIB
+                  sib = make_sib(1, 0, eff_addr)
+
+                # For [EBP+0] we can encode [EBP]+disp8 directly.
+                elsif eff_addr == EBP
+                  mod = 1
+                  rm = eff_addr.regnum
+                  disp8 = 0
+                else
+                  rm = eff_addr.regnum
+                end
+
+              # Register plus displacement, e.g. [EAX + 0xff]
+              elsif eff_addr.index? && eff_addr.index.is_a?(Numeric)
+                rm = eff_addr.regnum # the base register goes in r/m
+                # disp8, mod == 01
+                if SIGNED_BYTE === eff_addr.index
+                  mod = 1
+                  disp8 = eff_addr.index
+
+                # disp32, mod == 10
+                elsif SignedRange === eff_addr.index
+                  mod = 2
+                  disp32 = eff_addr.index
+
+                else
+                  raise "address must fit in 32 bits, this doesn't: #{eff_addr.index}"
+                end
+                sib = make_sib(1, 0, eff_addr.base) if rm == 4 # r/m 100 means SIB follows, so an ESP base needs one
+              # SIB
+              elsif eff_addr.index?
+                # scale-index-base, mod == 00 and rm == 100
+                rm = 4
+                sib = make_sib(eff_addr.scale || 1, eff_addr.index, eff_addr.base)
+
+              else
+                raise "unsupported effective address: #{addr.inspect}"
+              end
+
+            # disp32, mod == 00
+            when Numeric
+              mod = 0
+              rm = 5              # 101
+              disp32 = eff_addr
+
+            when VariableProxy
+              mod = 0
+              rm = 5
+              var = eff_addr
+
+            else
+              raise "unsupported effective address: #{addr.inspect}"
+            end
+
+          # register content, mod == 11
+          elsif addr.register?
+            mod = 3
+            rm = addr.regnum
+
+          # XXX TODO elsif addr.respond_to?(:name)
+          #            (VariableProxy) => [:(var|const), addr.name]
+          #
+          #            i.e. a pointer to that var
+
+          else
+            raise "unsupported effective address: #{addr.inspect}"
+          end
+
+          emit_byte((mod << 6) | (reg << 3) | rm)
+          emit_byte(sib) if sib
+
+          emit_byte(disp8) if disp8
+
+          emit_dword(disp32) if disp32
+          emit_var(var) if var
+        end
+
+        # Packs scale (1, 2, 4 or 8), index and base into a SIB byte; an index of 0 means "no index".
+        def make_sib(scale, index, base)
+          if [1,2,4,8].include?(scale)
+            scale = log2(scale).to_i
+          else
+            raise "unsupported SIB scale: #{scale}, should be 1, 2, 4, or 8"
+          end
+          if index == 0
+            index = 4
+          elsif index.respond_to?(:regnum)
+            index = index.regnum
+          end
+          base = base.regnum if base.respond_to?(:regnum)
+          return (scale << 6) | (index << 3) | base
+        end
+
+        # NOTE(review): the second clause matches anything whose #size equals the bit width, not only registers.
+        def register?(op, size = DEFAULT_OPERAND_SIZE)
+          op.is_a?(RegisterProxy) && op.size == size ||
+            op.respond_to?(:size) && op.size == SIZE_MAP[size]
+        end
+
+        def immediate?(op, size = DEFAULT_OPERAND_SIZE)
+          bits = SIZE_MAP[size] || size
+          op.is_a?(Numeric) && op >= -(2 ** bits / 2) && op <= (2 ** bits - 1)
+        end
+
+        # Return true if op is a valid operand of the specified size.
+        # (:byte, :word, :dword)
+        #
+        # Valid operands are:
+        #
+        #  * registers
+        #
+        #  * effective addresses (wrapped in an array to look like nasm code)
+        #
+        # XXX This method is pretty ugly.
+        def rm?(op, size = DEFAULT_OPERAND_SIZE)
+          is_register = register?(op, size)
+
+          if op.is_a?(Array)
+            case op.size
+
+            # [register/memory]
+            when 1
+              is_reg_or_mem = [Numeric, RegisterProxy, VariableProxy].any? { |k| k === op[0] }
+
+            # [size, memory]
+            when 2
+              is_size_and_mem = op[0] == size && [Numeric, RegisterProxy, VariableProxy].any? { |k| k === op[1] }
+
+            end
+
+          else
+            is_reg_or_mem = false
+            is_size_and_mem = false
+          end
+
+          is_register || is_reg_or_mem || is_size_and_mem
+        end
+
+        def offset?(addr, size = DEFAULT_OPERAND_SIZE)
+          addr.is_a?(Array) && (addr[0].is_a?(Numeric) || addr[0].is_a?(VariableProxy))
+        end
+
+        def constant?(op)
+          immediate?(op) || offset?(op)
+        end
+        # Base-2 logarithm of a positive x, refined bit by bit until the step drops below tol.
+        def log2(x, tol = 1e-13)
+          result = 0.0
+
+          # Integer part
+          while x < 1
+            result -= 1
+            x *= 2
+          end
+          while x >= 2
+            result += 1
+            x /= 2
+          end
+
+          # Fractional part
+          fp = 1.0
+          while fp >= tol
+            fp /= 2
+            x *= x
+            if x >= 2
+              x /= 2
+              result += fp
+            end
+          end
+          result
+        end
+
+
+        # 9 versions of the mov instruction are supported:
+        #  1.  mov reg32, immediate32
+        #  2a. mov reg32, r/m32
+        #  2b. mov eax, memoffset32
+        #  3a. mov r/m32, reg32
+        #  3b. mov memoffset32, eax
+        #  4.  mov r/m32, immediate32
+        #  5.  mov r/m8, imm8
+        #  6.  mov reg8, r/m8
+        #  7.  mov r/m8, reg8
+        def mov(dest, src)
+
+          # These 2 are used in the same way, just the name differs to make the
+          # meaning clear.  They are 4-byte values that are emitted at the end if
+          # they are non-nil.  Only one of them will be emitted, and if both are
+          # non-nil that one is immediate.
+          immediate = nil
+          offset = nil
+
+          # This is an array of arguments to be passed to emit_modrm, if it is set.
+          modrm = nil
+
+          # version 1: mov r32, imm32
+          if register?(dest) && immediate?(src)
+            opcode = 0xb8 + dest.regnum # dest encoded in instruction
+            immediate = src
+
+          # version 2a: mov r32, r/m32
+          elsif register?(dest) && rm?(src)
+            # version 2b: mov eax, moffs32
+            if dest == EAX && offset?(src)
+              opcode = 0xa1
+              offset = src[0]
+            else
+              opcode = 0x8b
+              modrm = [src, dest.regnum]
+            end
+
+          # version 3a: mov r/m32, r32
+          elsif rm?(dest) && register?(src)
+            # version 3b: mov moffs32, eax
+            if offset?(dest) && src == EAX
+              opcode = 0xa3
+              offset = dest[0]
+            else
+              opcode = 0x89
+              modrm = [dest, src.regnum]
+            end
+
+          # version 4: mov r/m32, imm32
+          elsif rm?(dest) && immediate?(src)
+            opcode = 0xc7
+            modrm = [dest, 0]
+            immediate = src
+
+          # version 5: mov r/m8, imm8
+          #
+          # It's important that this check is first because src integers can
+          # pass the register? check in version 7.
+          elsif rm?(dest, :byte) && immediate?(src, :byte)
+            opcode = 0xc6
+            modrm = [dest, 0]
+            immediate_byte = src
+
+          # version 6: mov r8, r/m8
+          elsif register?(dest, :byte) && rm?(src, :byte)
+            opcode = 0x8a
+            modrm = [src, dest.regnum]
+
+          # version 7: mov r/m8, r8
+          elsif rm?(dest, :byte) && register?(src, :byte)
+            opcode = 0x88
+            modrm = [dest, src.regnum]
+
+          else
+            # puts "rm?(dest): #{rm?(dest)}\t\trm?(src): #{rm?(src)}"
+            # puts "register?(dest): #{register?(dest)}\t\tregister?(src): #{register?(src)}"
+            # puts "immediate?(dest): #{immediate?(dest)}\t\timmediate?(src): #{immediate?(src)}"
+            # puts "offset?(dest): #{offset?(dest)}\t\toffset?(src): #{offset?(src)}"
+            # puts "rm?(dest, :byte): #{rm?(dest)}\t\trm?(src, :byte): #{rm?(src, :byte)}"
+            # puts "immediate?(dest, :byte): #{immediate?(dest)}\t\timmediate?(src, :byte): #{immediate?(src, :byte)}"
+            raise "unsupported MOV instruction, #{dest.inspect}, #{src.inspect}"
+          end
+
+          dword = immediate || offset
+
+          asm do
+            emit_byte(opcode)
+            emit_modrm(*modrm) if modrm
+
+            if dword.is_a?(VariableProxy)
+              if dword.const?
+                emit_const(dword)
+              else
+                emit_var(dword)
+              end
+
+            elsif dword
+              emit_dword(dword)
+
+            elsif immediate_byte
+              emit_byte(immediate_byte)
+
+            end
+          end
+        end
+
+        # Zero-extending move into a 32-bit register from a byte or word r/m operand.
+        def movzx(dest, src)
+
+          # movzx Gv, ??
+          if register?(dest)
+
+            opcode = case
+                     when rm?(src, :byte)
+                       0xb6 # movzx Gv, Eb
+                     when rm?(src, :word)
+                       0xb7 # movzx Gv, Ew
+                     else
+                       raise "unsupported MOVZX instruction, dest=#{dest.inspect} << src=#{src.inspect} >>"
+                     end
+            asm do
+              emit_byte(0x0f)
+              emit_byte(opcode)
+              emit_modrm(src, dest.regnum)
+            end
+
+          else
+
+            raise "unimplemented MOVZX instruction, << dest=#{dest.inspect} >> src=#{src.inspect}"
+          end
+        end
+
+        # Exchange two operands; the one-byte form is used when either side is EAX.
+        def xchg(dest, src)
+          if dest == EAX && register?(src)
+            asm { emit_byte(0x90 + src.regnum) }
+          # swap the args if EAX comes last so we only need to handle one case below.
+          elsif src == EAX && register?(dest)
+            xchg(src, dest)
+          elsif rm?(dest) && register?(src)
+            asm do
+              emit_byte(0x87)
+              emit_modrm(dest, src.regnum)
+            end
+          elsif register?(dest) && rm?(src)
+            asm do
+              emit_byte(0x87)
+              emit_modrm(src, dest.regnum)
+            end
+          else
+            raise "unsupported XCHG instruction, dest=#{dest.inspect} src=#{src.inspect}"
+          end
+        end
+
+        # convert double to quad (sign-extend EAX into EDX)
+        def cdq
+          asm { emit_byte(0x99) }
+        end
+
+        # The imm8 form sign-extends its operand, so only -128..127 may use it.
+        def add(dest, src)
+          # add r/m32, imm8
+          if rm?(dest) && SIGNED_BYTE === src
+            asm do
+              emit_byte(0x83)
+              emit_modrm(dest, 0)
+              emit_byte(src)
+            end
+
+          # add r/m32, imm32
+          elsif rm?(dest) && immediate?(src)
+            asm do
+              emit_byte(0x81)
+              emit_modrm(dest, 0)
+              emit_dword(src)
+            end
+
+          # add eax, imm32
+          elsif dest == EAX && immediate?(src)
+            asm do
+              emit_byte(0x05)
+              emit_dword(src)
+            end
+
+          # add reg32, r/m32
+          elsif register?(dest) && rm?(src)
+            asm do
+              emit_byte(0x03)
+              emit_modrm(src, dest.regnum)
+            end
+
+          else
+            raise "unsupported ADD instruction, dest=#{dest.inspect} src=#{src.inspect}"
+          end
+        end
+
+        # The imm8 form sign-extends its operand, so only -128..127 may use it.
+        def sub(dest, src)
+          # sub r/m32, imm8
+          if rm?(dest) && SIGNED_BYTE === src
+            asm do
+              emit_byte(0x83)
+              emit_modrm(dest, 5)
+              emit_byte(src)
+            end
+
+          # sub r/m32, imm32
+          elsif rm?(dest) && immediate?(src)
+            asm do
+              emit_byte(0x81)
+              emit_modrm(dest, 5)
+              emit_dword(src)
+            end
+
+          # sub r/m32, reg32
+          elsif rm?(dest) && register?(src)
+            asm do
+              emit_byte(0x29)
+              emit_modrm(dest, src.regnum)
+            end
+
+          # sub reg32, r/m32
+          elsif register?(dest) && rm?(src)
+            asm do
+              emit_byte(0x2b)
+              emit_modrm(src, dest.regnum)
+            end
+
+          else
+            raise "unsupported SUB instruction, dest=#{dest.inspect} src=#{src.inspect}"
+          end
+        end
+
+
+        # Signed multiply.
+        def imul(*ops)
+          case ops.size
+
+          when 1
+            group3(ops[0], 5, 'IMUL')
+
+          when 2
+            dest, src = ops
+            raise "unsupported IMUL instruction, dest=#{dest.inspect} src=#{src.inspect}"
+
+          else
+            raise ArgumentError, "IMUL accepts exactly 1 or 2 operands (got #{ops.inspect})"
+          end
+        end
+
+        # Unsigned multiply.
+        def mul(op)
+          group3(op, 4, 'MUL')
+        end
+
+
+        # Signed divide.
+        def idiv(op)
+          group3(op, 7, 'IDIV')
+        end
+
+        # Unsigned divide.
+        def div(op)
+          group3(op, 6, 'DIV')
+        end
+
+        # Only inc reg32 is implemented; any other operand raises.
+        def inc(op)
+          asm do
+            if register?(op)
+              emit_byte(0x40 + op.regnum)
+            elsif rm?(op)
+              # emit_byte(0xff)
+              raise "unimplemented"
+            else
+              raise "unsupported op #{op}, wanted r32 or r/m32"
+            end
+          end
+        end
+
+        # Only dec reg32 is implemented.
+        def dec(op)
+          if register?(op)
+            # dec reg32
+            asm { emit_byte(0x48 + op.regnum) }
+          else
+            raise "unsupported DEC instruction, op=#{op.inspect}"
+          end
+        end
+
+        # Shift right by an immediate byte.
+        def shr(op, n)
+
+          # shr r/m??, imm8
+          if SIGNED_BYTE === n
+
+            opcode = register?(op, :byte) ? 0xc0 : 0xc1
+
+            asm do
+              emit_byte(opcode)
+              emit_modrm(op, 5)
+              emit_byte(n)
+            end
+
+          else
+            raise "unsupported SHR instruction, op=#{op.inspect}, n=#{n.inspect}"
+          end
+
+        end
+
+        # Defined as and_ and aliased, presumably because `and` is a Ruby keyword.
+        def and_(dest, src)
+          if rm?(dest) && register?(src)
+            asm do
+              emit_byte(0x21)
+              emit_modrm(dest, src.regnum)
+            end
+          elsif rm?(dest, 8) && immediate?(src, 8)
+            asm do
+              emit_byte(0x80)
+              emit_modrm(dest, 4)
+              emit_byte(src)
+            end
+          else
+            raise "unsupported AND instruction: dest=#{dest.inspect}, src=#{src.inspect}"
+          end
+        end
+        alias_method :and, :and_
+
+        def or_(dest, src)
+          if rm?(dest) && register?(src)
+            asm do
+              emit_byte(0x9)
+              emit_modrm(dest, src.regnum)
+            end
+          elsif rm?(dest, 8) && immediate?(src, 8)
+            asm do
+              emit_byte(0x80)
+              emit_modrm(dest, 1)
+              emit_byte(src)
+            end
+          else
+            raise "unsupported OR instruction: dest=#{dest.inspect}, src=#{src.inspect}"
+          end
+        end
+        alias_method :or, :or_
+
+        def xor(dest, src)
+          # xor r/m32, reg32
+          if rm?(dest) && register?(src)
+            asm do
+              emit_byte(0x31)
+              emit_modrm(dest, src.regnum)
+            end
+
+          else
+            raise "unsupported XOR instruction, dest=#{dest.inspect} src=#{src.inspect}"
+          end
+        end
+
+
+        def not_(op)
+          group3(op, 2, 'NOT')
+        end
+        alias_method :not, :not_
+
+
+        def neg(op)
+          group3(op, 3, 'NEG')
+        end
+
+        # push imm8 sign-extends its operand, so only -128..127 may use the short form.
+        def push(op)
+          # push reg32
+          if register?(op)
+            asm { emit_byte(0x50 + op.regnum) }
+
+          elsif SIGNED_BYTE === op
+            asm do
+              emit_byte(0x6a)
+              emit_byte(op)
+            end
+
+          elsif immediate?(op)
+            asm do
+              emit_byte(0x68)
+              emit_dword(op)
+            end
+
+          else
+            raise "unsupported PUSH instruction: op=#{op.inspect}"
+          end
+        end
+
+        # Only pop reg32 is implemented.
+        def pop(op)
+          # pop reg32
+          if register?(op)
+            asm { emit_byte(0x58 + op.regnum) }
+
+          else
+            raise "unsupported POP instruction: op=#{op.inspect}"
+          end
+        end
+
+        # Supports cmp r/m32, reg32 and cmp eax, imm32.
+        def cmp(op1, op2)
+          # cmp r/m32, reg32
+          if rm?(op1) && register?(op2)
+            asm do
+              emit_byte(0x39)
+              emit_modrm(op1, op2.regnum)
+            end
+
+          # cmp eax, imm32
+          elsif op1 == EAX && immediate?(op2)
+            asm do
+              emit_byte(0x3d)
+              emit_dword(op2)
+            end
+
+          else
+            raise "unsupported CMP instruction: op1=#{op1.inspect} op2=#{op2.inspect}"
+          end
+        end
+
+
+        # Only jmp rel32 is supported.
+        def jmp(label)
+          asm do
+            emit_byte(0xe9)
+            emit_label(label)
+          end
+        end
+
+        # These all jump near (rel32).
+        JccOpcodeMap = Hash.new { |_map, key| raise "unsupported Jcc instruction: #{key}" }.
+          merge({
+                  :jc   => 0x82, # carry (CF=1)
+                  :je   => 0x84, # equal (ZF=1) --- same as jz
+                  :jg   => 0x8f, # greater (ZF=0 and SF=OF)
+                  :jl   => 0x8c, # less than (SF!=OF)
+                  :jne  => 0x85, # not equal (ZF=0) --- same as jnz
+                  :jng  => 0x8e, # not greater than (ZF=1 or SF!=OF)
+                  :jnl  => 0x8d, # not less than (SF=OF)
+                  :jnz  => 0x85, # not zero (ZF=0)
+                  :jo   => 0x80, # overflow (OF=1)
+                  :js   => 0x88, # sign (SF=1)
+                  :jz   => 0x84  # zero (ZF=1)
+                })
+
+        # Only Jcc rel32 is supported.
+        def jcc(instruction, label)
+          opcode = JccOpcodeMap[instruction]
+          asm do
+            emit_byte(0x0f)
+            emit_byte(opcode)
+            emit_label(label)
+          end
+        end
+
+        JccOpcodeMap.keys.each do |name|
+          define_method(name) do |label|
+            jcc(name, label)
+          end
+        end
+
+        # lea r32, mem: mem is encoded through emit_modrm.
+        def lea(r32, mem)
+          asm do
+            emit_byte(0x8d)
+            emit_modrm(mem, r32.regnum)
+          end
+        end
+
+        # int n (software interrupt).
+        def int(n)
+          asm do
+            emit_byte(0xcd)
+            emit_byte(n)
+          end
+        end
+
+
+        def ret
+          asm { emit_byte(0xc3) }
+        end
+
+
+        def leave
+          asm { emit_byte(0xc9) }
+        end
+
+
+        # NOTE: LOOP only accepts a 1-byte signed offset. Don't use it.
+        def loop_(label)
+          real_ip = ip + 2 # loop instruction is 2 bytes
+          delta = @symtab.lookup_label(label) - real_ip
+          unless SIGNED_BYTE === delta
+            raise "LOOP can only jump -128 to 127 bytes, #{label} is #{delta} bytes away"
+          end
+
+          asm do
+            emit_byte(0xe2)
+            emit_byte(delta)
+          end
+        end
+        alias_method :loop, :loop_
+
+
+        # Opcode group #3.  1-byte opcode, 1 operand (r/m8 or r/m32).
+        #
+        # Members of this group are: DIV, IDIV, MUL, IMUL, NEG, NOT, and TEST.
+ def group3(op, reg, instruction) + opcode = + if rm?(op, 8) + 0xf6 + elsif rm?(op) + 0xf7 + else + raise "unsupported #{instruction} instruction: op=#{op.inspect}" + end + + asm do + emit_byte(opcode) + emit_modrm(op, reg) + end + end + + + end + + end + end +end diff --git a/lib/compiler/asm/x86/registers.rb b/lib/compiler/asm/x86/registers.rb new file mode 100644 index 0000000..c9668b8 --- /dev/null +++ b/lib/compiler/asm/x86/registers.rb @@ -0,0 +1,32 @@ +require 'asm/regproxy' + +module ASM + + module Registers + + # This structure allows for x86 registers of all sizes. The + # number of the register is the index of the array in which it was + # found. The size of a register in bytes is 2 ** index-into-sub-array. + Registers = [ [:al, :ax, :eax], # 0 + [:cl, :cx, :ecx], # 1 + [:dl, :dx, :edx], # 2 + [:bl, :bx, :ebx], # 3 + [:ah, :sp, :esp], # 4 + [:ch, :bp, :ebp], # 5 + [:dh, :si, :esi], # 6 + [:bh, :di, :edi] # 7 + ] + + # Setup register proxies which are used both in effective address + # calculations, and also just as symbols representing registers. + Registers.each_with_index do |group, regnum| + group.each_with_index do |reg, i| + name = reg.to_s.upcase + const_set(name, RegisterProxy.new(reg, 8 * (2 ** i), regnum)) + end + end + + + end + +end \ No newline at end of file diff --git a/lib/compiler/asm/x86/template.darwin.asm b/lib/compiler/asm/x86/template.darwin.asm new file mode 100644 index 0000000..673f104 --- /dev/null +++ b/lib/compiler/asm/x86/template.darwin.asm @@ -0,0 +1,11 @@ +BITS 32 +GLOBAL _main +SECTION .data +{data} +SECTION .bss +{bss} +SECTION .text +_main: +{code} + ;; The result in eax is the exit code, just return. 
+ ret diff --git a/lib/compiler/asm/x86/template.linux.asm b/lib/compiler/asm/x86/template.linux.asm new file mode 100644 index 0000000..0c851d1 --- /dev/null +++ b/lib/compiler/asm/x86/template.linux.asm @@ -0,0 +1,13 @@ +BITS 32 +GLOBAL _start +SECTION .data +{data} +SECTION .bss +{bss} +SECTION .text +_start: +{code} + ;; The result in eax is the exit code, move it to ebx. + mov ebx, eax + mov eax, 1 ; _exit syscall + int 0x80 ; call Linux diff --git a/lib/compiler/asm/x86/text_assembler.rb b/lib/compiler/asm/x86/text_assembler.rb new file mode 100644 index 0000000..764a921 --- /dev/null +++ b/lib/compiler/asm/x86/text_assembler.rb @@ -0,0 +1,159 @@ +# A subset of x86 assembly. +# +# sjs +# may 2009 + +require 'compiler/asm/text_assembler' + +class Compiler + module ASM + module X86 + + # ASM methods output nasm-friendly x86 asm code, line by + # line. This is dead easy and we can trust nasm to compile + # correct machine code, which isn't trivial. + class TextAssembler < ASM::TextAssembler + + def emit_entry_point + end + + # Define a constant in the .data section. + def const(name, value) + @data << "#{name}\tequ #{value}" + end + + # Define a variable with the given name and size in bytes. + def define_var_impl(name, bytes = nil) + super(name, bytes) + dwords = bytes / 4 + @bss << "#{name}: resd #{dwords}\n" + end + + def output + File.read(template_filename). + sub("{data}", @data). + sub("{bss}", @bss). + sub("{code}", @code) + end + + def emit_label(name = label) + emit("#{name}:", tab: nil) + end + + def mov(dest, src) + emit("mov #{dest}, #{src}#{src.is_a?(Numeric) ? 
" ; 0x#{src.to_s(16)}" : ''}") + end + + def movzx(dest, src) + emit("movzx #{dest}, #{src}") + end + + def add(dest, src) + emit("add #{dest}, #{src}") + end + + def sub(dest, src) + emit("sub #{dest}, #{src}") + end + + def imul(op) + emit("imul #{op}") + end + + def idiv(op) + emit("idiv #{op}") + end + + def inc(op) + emit("inc #{op}") + end + + def dec(op) + emit("dec #{op}") + end + + def push(reg) + emit("push #{reg}") + end + + def pop(reg) + emit("pop #{reg}") + end + + def call(label) + emit("call #{label}") + end + + def leave + emit("leave") + end + + def neg(reg) + emit("neg #{reg}") + end + + def not(rm32) + emit("not #{rm32}") + end + + def xchg(op1, op2) + emit("xchg #{op1}, #{op2}") + end + + def and_(op1, op2) + emit("and #{op1}, #{op2}") + end + + def or(op1, op2) + emit("or #{op1}, #{op2}") + end + + def xor(op1, op2) + emit("xor #{op1}, #{op2}") + end + + def jz(label) + emit("jz #{label}") + end + + def jnz(label) + emit("jnz #{label}") + end + + def jmp(label) + emit("jmp #{label}") + end + + def jl(label) + emit("jl #{label}") + end + + def cmp(a, b) + emit("cmp #{a}, #{b}") + end + + def lea(a, b) + emit("lea #{a}, #{b}") + end + + def shr(a, b) + emit("shr #{a}, #{b}") + end + + def loop_(label) + emit("loop #{label}") + end + + def int(num) + emit("int 0x#{num.to_s(16)}") + end + + def cdq + emit("cdq") + end + + end + + end + end +end diff --git a/lib/compiler/build.rb b/lib/compiler/build.rb new file mode 100755 index 0000000..e3994f3 --- /dev/null +++ b/lib/compiler/build.rb @@ -0,0 +1,108 @@ +#!/usr/bin/env ruby + +require 'compiler' + +# usage: build.rb [output filename] [elf | macho] [asm | bin] + +BIN_FORMATS = Hash.new('bin') +BIN_FORMATS['darwin'] = 'macho' +BIN_FORMATS['linux'] = 'elf' + +def main + filename = ARGV.shift.to_s + raise "can't read #{filename}" unless File.readable?(filename) + outdir = ARGV.shift || '.' + platform = `uname -s`.chomp.downcase + binformat = ARGV[1] ? 
ARGV[1].downcase : BIN_FORMATS[platform] + puts "Building #{filename} for #{platform}, binformat is #{binformat} ..." + outfile = build(filename, outdir, platform, binformat) + puts outfile + exit +end + + +def error(msg) STDERR.puts(msg) end + +# name part (filename minus extension) +def base(filename) + filename.sub(/\.[^.]*$/, '') +end + + +# infile: input filename +# outfile: output filename +# asm: assembler to use +def compile(infile, outfile, asm) + + File.open(infile, 'r') do |input| + File.open(outfile, 'wb') do |out| + out.print(Compiler.compile(input, asm)) + end + end + +rescue ParseError => e + error("[error] #{e.message}") + error("[context] #{e.context}") + error(e.caller) + exit(1) +end + +def run_and_warn_on_failure(command) + output = `#{command}` + if $?.exitstatus != 0 + puts + print output + name = command.split.first + raise "#{name} failed: #{$?.exitstatus}" + end +end + +# link with ld, return resulting filename. +def link(filename, outdir, platform = 'linux') + f = base(filename) + cmd, args = *case platform + when 'darwin' + ['gcc', '-arch i386'] + when 'linux' + ['ld', ''] + else + raise "unsupported platform: #{platform}" + end + run_and_warn_on_failure("#{cmd} #{args} -o #{f} #{filename} 2>&1") + `chmod u+x #{f}` + return f +end + +def build(filename, outdir, platform = 'linux', binformat = 'macho') + objfile = File.join(outdir, base(filename) + '.o') + symtab, objwriter_class = + case binformat + when 'elf' + [Compiler::ELF::SymbolTable.new, Compiler::ELF::ObjectFile] + when 'macho' + [Compiler::MachO::SymbolTable.new, Compiler::MachO::ObjectFile] + else + raise "unsupported binary format: #{binformat}" + end + compile(filename, objfile, Compiler::ASM::Binary.new(platform, symtab, objwriter_class)) + exefile = link(objfile, outdir, platform) + return exefile +end + +def build_asm(filename, outdir, platform = 'linux', binformat = 'macho') + asmfile = File.join(outdir, base(filename) + '.asm') + compile(filename, asmfile, 
Compiler::ASM::Text.new(platform)) + objfile = assemble(asmfile, binformat) + exefile = link(objfile, platform) + return exefile +end + +# assemble using nasm, return resulting filename. +def assemble(filename, binformat = 'macho') + f = base(filename) + outfile = "#{f}.o" + run_and_warn_on_failure("nasm -f #{binformat} -g -o #{outfile} #{filename} 2>&1") + return outfile +end + +main if $0 == __FILE__ diff --git a/lib/compiler/parse_error.rb b/lib/compiler/parse_error.rb new file mode 100644 index 0000000..0c3a915 --- /dev/null +++ b/lib/compiler/parse_error.rb @@ -0,0 +1,14 @@ +class Compiler + + class ParseError < RuntimeError + + attr_reader :caller, :context + + def initialize(caller, context = nil) + @caller = caller + @context = context + end + + end + +end diff --git a/lib/compiler/parser.rb b/lib/compiler/parser.rb new file mode 100644 index 0000000..288ad0e --- /dev/null +++ b/lib/compiler/parser.rb @@ -0,0 +1,966 @@ +# A compiler as described by Jack Crenshaw in his famous book "Let's +# Build a Compiler". At least in the beginning, this code will +# closely reflect the Pascal code written by Jack. Over time it may +# become more idiomatic, however this is an academic exercise. +# +# sjs +# may 2009 + +require 'compiler/parse_error' + +class Compiler + + class Parser + + KEYWORDS = { + 'if' => :if_else_stmt, + 'while' => :while_stmt, + 'until' => :until_stmt, + 'repeat' => :repeat_stmt, + 'for' => :for_stmt, + 'do' => :do_stmt, + 'break' => :break_stmt, + 'print' => :print_stmt, + 'else' => nil, + 'end' => nil + } + + # Grouped by precedence. + OPS = { + :add => %w[+ -], + :mul => %w[* /], + :rel => %w[== != < > <= >=], + :or => %w[||], + :and => %w[&&], + :bit => %w[| ^ &], + :unary => %w[- +] + } + # Op chars are chars that can begin an op, so OP_CHARS needs to be a + # map of kinds of operators to a list of valid prefix chars. 
OP_CHARS = OPS.each_with_object({}) { |(kind, ops), prefixes|
  prefixes[kind] = ops.map { |op| op[0, 1] } # first char of each op of this kind
  # Include :all for a very general test.
}.merge(:all => OPS.values.flatten.map { |op| op[0, 1] }.sort.uniq)

# Truth values as they are loaded into the return register: false is
# 0 and true is -1, so that a one's complement NOT flips one into the
# other.
FALSE = 0
TRUE = -1

attr_reader :asm

# input: stream to read source text from (must respond to eof?,
#        readbyte, readline and ungetc).
# asm:   assembler that receives the generated instructions.
def initialize(input, asm)
  @indent = 0         # for pretty printing
  @look = ''          # Next lookahead char.
  @token = nil        # Type of last read token.
  @value = nil        # Value of last read token.
  @input = input      # Stream to read from.
  @asm = asm          # assembler
  @keywords = KEYWORDS.clone
  @keyword_names = @keywords.keys
  @label_stack = []   # end labels of the enclosing loops, innermost last

  # seed the lexer
  get_char
end

# Parse the whole input as a single block and insist that nothing is
# left over afterwards.
def parse
  block # parse a block of code
  expected(:'end of file') unless eof?
end

# Return whatever the assembler produced for the parsed program.
def compile
  asm.output
end

# Scan the input stream for the next token. The token type ends up in
# @token and its text in @value. At end of input (nil lookahead) both
# are left untouched.
def scan
  return if @look.nil? # eof

  if alpha?(@look)
    get_name

  elsif digit?(@look)
    get_number

  elsif op_char?(@look)
    get_op

  elsif newline?(@look)
    skip_any_whitespace
    scan

  elsif comment_char?(@look)
    skip_comment
    scan

  else
    # XXX default to single char op... should probably raise.
    @token = :op
    @value = @look
    get_char
  end
end

# Put back the most recently parsed value: first the lookahead char,
# then the bytes of @value in reverse so they are read again in their
# original order, and finally re-prime the lookahead.
def backtrack
  # scan and eof? treat a nil lookahead as end of input; there is no
  # lookahead char to return in that case.
  @input.ungetc(@look[0]) unless @look.nil?
  @value.reverse.each_byte { |byte| @input.ungetc(byte) }
  get_char
end

# Parse and translate an identifier or function call.
def identifier
  name = get_name

  if @look == '('
    # function call
    match('(')
    # TODO arg list
    match(')')
    asm.call(name)
  else
    # variable access
    asm.load_var(name)
  end
end

# Parse and translate a single factor. Result is in eax.
def factor
  # A factor is a parenthesized expression, an identifier / call, or
  # an integer literal; anything else is a parse error.
  if @look == '('
    match('(')
    boolean_expression
    match(')')
  elsif alpha?(@look)
    identifier # or call
  elsif digit?(@look)
    asm.load(get_number.to_i)
  else
    expected(:'integer, identifier, function call, or parenthesized expression', :got => @look)
  end
end

# Parse a signed factor. A leading unary + or - is consumed; for - the
# factor's result is negated in the return register.
def signed_factor
  sign = @look
  match(sign) if op?(:unary, sign)
  factor
  asm.neg(return_reg) if sign == '-'
end

# Parse and translate a single term (factor or mulop). Result is in
# eax.
def term
  signed_factor # Result in eax.

  while op?(:mul, @look)
    asm.push(return_reg) # left operand waits on the stack
    case @look
    when '*'
      multiply
    when '/'
      divide
    end
  end
end

# Parse and translate a general expression of terms. Result is
# in eax.
def arithmetic_expression
  term # Result is in eax.

  while op_char?(@look, :add)
    asm.push(return_reg) # left operand waits on the stack
    case @look
    when '+'
      add
    when '-'
      subtract
    end
  end
end

# Parse an addition operator and the 2nd term (b). The result is
# left in eax. The 1st term (a) is expected on the stack.
def add
  match('+')
  term                      # Result is in eax.
  asm.stack_add(return_reg) # Add a to b.
end

# Parse a subtraction operator and the 2nd term (b). The result is
# left in eax. The 1st term (a) is expected on the stack.
def subtract
  match('-')
  term                      # Result, b, is in eax.
  asm.neg(return_reg)       # Fake the subtraction. a - b == a + -b
  asm.stack_add(return_reg) # Add a to -b.
end

# Parse a multiplication operator and the 2nd factor (b). The result
# is left in eax. The 1st factor (a) is expected on the stack.
def multiply
  match('*')
  signed_factor                    # Result, b, is in return_reg.
  asm.stack_mul_signed(return_reg) # Multiply a by b.
end

# Parse a division operator and the divisor (b). The result is
# left in eax. The dividend (a) is expected on the stack.
def divide
  match('/')
  signed_factor             # Result is in eax.
  asm.stack_div(return_reg) # Divide a by b.
end


###################
# bit expressions #
###################

# Parse an arithmetic expression followed by any number of |, ^ or &
# operations. If the scanned operator turns out not to be a single
# bit operator (e.g. it is || or &&) it is pushed back and parsing of
# the bit expression stops.
def bit_expression
  arithmetic_expression
  while op?(:bit, @look)
    scan
    case @value
    when '|'
      bitor_expression
    when '^'
      bitxor_expression
    when '&'
      bitand_expression
    else
      backtrack
      return
    end
  end
end

# Emit one binary bit operation: save the left operand, parse the
# right operand, then call the assembler's stack_<op> with the return
# register. `token` is accepted but not used.
def bit_op(op, token)
  asm.push(return_reg)
  arithmetic_expression
  asm.send("stack_#{op}", return_reg)
end

def bitor_expression
  bit_op(:or, '|')
end

def bitxor_expression
  bit_op(:xor, '^')
end

def bitand_expression
  bit_op(:and, '&')
end


#######################
# boolean expressions #
#######################

# Parse boolean terms joined by ||. The result in the return register
# is normalized to TRUE or FALSE. The right operand is skipped when
# the left one is already truthy.
def boolean_expression
  boolean_term
  while @look == '|'
    scan
    expected('||') unless match_word('||')

    false_label = asm.make_label(:false)
    truthy_label = asm.make_label(:truthy)
    done_label = asm.make_label(:done)

    asm.compare(return_reg, FALSE)
    asm.jne(truthy_label) # left side truthy: short-circuit

    boolean_term
    asm.compare(return_reg, FALSE)
    asm.je(false_label)

    asm.define_label(truthy_label)
    asm.mov_reg_imm(return_reg, TRUE)
    asm.jmp(done_label)

    asm.define_label(false_label)
    asm.mov_reg_imm(return_reg, FALSE)

    asm.define_label(done_label)
  end
end

# Parse factors joined by &&. The result in the return register is
# normalized to TRUE or FALSE. The right operand is skipped when the
# left one is already false.
def boolean_term
  not_factor
  while @look == '&'
    scan
    expected('&&') unless match_word('&&')
    false_label = asm.make_label(:false)
    done_label = asm.make_label(:done)

    asm.compare(return_reg, FALSE)
    asm.je(false_label) # left side false: short-circuit

    not_factor
    asm.compare(return_reg, FALSE)
    asm.je(false_label)

    asm.mov_reg_imm(return_reg, TRUE)
    asm.jmp(done_label)

    # Either operand was false, so the conjunction is false.
    asm.define_label(false_label)
    asm.mov_reg_imm(return_reg, FALSE)

    asm.define_label(done_label)
  end
end

# Parse a boolean literal (when boolean? recognizes one) or fall back
# to a relation.
def boolean_factor
  if boolean?(@look)
    if get_boolean == 'true'
      asm.mov_reg_imm(return_reg, TRUE)
    else
      asm.xor(return_reg, return_reg)
    end
    scan
  else
    relation
  end
end

# Parse a boolean factor with an optional leading ! (logical not).
def not_factor
  if @look == '!'
    match('!')
    boolean_factor
    make_boolean(return_reg) # ensure it is -1 or 0...
    asm.not_(return_reg)     # so that 1's complement NOT is also boolean not
  else
    boolean_factor
  end
end

# Convert any identifier to a boolean (-1 or 0). This is
# semantically equivalent to !!reg in C or Ruby.
def make_boolean(reg)
  end_label = asm.make_label(:endmakebool)
  asm.compare(reg, FALSE) # if false do nothing
  asm.jz(end_label)
  asm.mov_reg_imm(reg, TRUE) # truthy, make it true
  asm.define_label(end_label)
end

# Parse a bit expression optionally followed by one relational
# operator and a second bit expression. The left operand is pushed
# before the operator-specific method runs. An operator that matches
# none of the cases below emits nothing further.
def relation
  bit_expression
  if op_char?(@look, :rel)
    scan
    asm.push(return_reg)

    case @value
    when '=='
      eq_relation
    when '!='
      neq_relation
    when '>'
      gt_relation
    when '>='
      ge_relation
    when '<'
      lt_relation
    when '<='
      le_relation
    end
  end
end

# a: on the stack
# b: eax
#
# If b - a is zero then a = b, and make_boolean will leave the zero
# to effectively return false. If b - a is non-zero then a != b,
# and make_boolean will leave -1 (true) for us in eax.
def neq_relation
  bit_expression
  asm.stack_sub(return_reg)
  make_boolean(return_reg) # make_boolean requires the register to normalize
end

# Invert the != test for equal.
def eq_relation
  neq_relation
  asm.not(return_reg)
end

# > and < are both implemented in terms of jl (jump if less than).
# We exploit the fact that cmp is the subtraction of src from dest
# and order the terms appropriately for each function. As for >=
# and <=, they in turn are implemented in terms of > and <. a is
# greater than or equal to b if and only if a is *not* less than b.
#
# Note: This was done to minimize the number of instructions that
#       the assembler needed to implement, but since the Jcc
#       instructions are very cheap to implement this is no longer
#       a concern.


# The next 4 relations all compare 2 values a and b, then return
# true (-1) if the difference was below zero and false (0)
# otherwise (using JL, jump if less than).
def cmp_relation(a, b, options = {})
  # Parse the right-hand operand (left in EAX), then fetch the
  # left-hand operand that `relation` pushed into EBX.
  bit_expression
  asm.pop(EBX)

  # Invert the sense of the test?
  invert = options[:invert]

  true_label = asm.make_label(:cmp)
  end_label = asm.make_label(:endcmp)
  asm.compare(a, b)
  asm.jl(true_label)

  # Not less than.
  asm.mov(EAX, FALSE)         # return false
  asm.not_(EAX) if invert     # (or true if inverted)
  asm.jmp(end_label)

  # Less than.
  asm.define_label(true_label)
  asm.mov(EAX, FALSE)         # load false, then flip it to ...
  asm.not_(EAX) unless invert # ... true (left as false if inverted)

  asm.define_label(end_label)
end

# a: on the stack
# b: eax
#
# if a > b then b - a < 0
def gt_relation
  # TODO: fix this
  cmp_relation(EAX, EBX) # b - a
end

# a: on the stack
# b: eax
#
# if a < b then a - b < 0
def lt_relation
  cmp_relation(EBX, EAX) # a - b
end

# a: on the stack
# b: eax
#
# if a >= b then !(a < b)
def ge_relation
  # Compare them as in less than but invert the result.
  cmp_relation(EBX, EAX, :invert => true)
end

# a: on the stack
# b: eax
#
# if a <= b then !(a > b)
def le_relation
  # Compare them as in greater than but invert the result.
  cmp_relation(EAX, EBX, :invert => true)
end


######################################
# statements and controls structures #
######################################

# Dispatch the keyword in @value to the parser method registered for
# it in @keywords. Keywords mapped to nil are rejected.
def keyword
  unless action = @keywords[@value]
    raise "unsupported keyword: #{@value}"
  end
  send(action)
end

# Parse an assignment statement. Value is in eax.
def assignment
  name = @value # the identifier was already scanned by the caller
  match('=')
  boolean_expression
  lval = asm.var!(name)
  asm.store_var(lval, return_reg)
end

# Parse a code block: statements up to (not including) the next
# 'else' or 'end' keyword, or the end of input.
def block
  @indent += 1

  # scan a token, type ends up in @token and value in @value
  scan

  until @value == 'else' || @value == 'end' || eof?
    if @token == :keyword
      keyword
    else
      assignment
    end

    scan
  end

  @indent -= 1
end

# Parse an if-else statement.
def if_else_stmt
  # One label is enough when there is no else clause: it marks both
  # "skip the body" and "end of statement". A second label is only
  # made once an else clause shows up.
  skip_label = asm.make_label(:end_or_else)
  exit_label = skip_label

  condition
  skip_any_whitespace
  asm.jz(skip_label)
  block

  has_else = (@token == :keyword && @value == 'else')
  if has_else
    skip_any_whitespace
    exit_label = asm.make_label(:endif) # now we need the 2nd label
    asm.jmp(exit_label)
    asm.define_label(skip_label)
    block
  end

  match_word('end')
  asm.define_label(exit_label)
end

# Used to implement the Two-Label-Loops (while, until, repeat).
#
# name: Name of the loop for readable labels.
#
# The given Ruby block is called once with the end label, right after
# the start label has been defined, to emit whatever should run at
# the top of each iteration (e.g. a condition).
def simple_loop(name)
  top = asm.make_label("#{name}_loop".to_sym)
  bottom = asm.make_label("end_#{name}".to_sym)

  asm.define_label(top)
  yield(bottom)
  pushing_label(bottom) do
    block
  end
  match_word('end')
  asm.jmp(top)
  asm.define_label(bottom)
end

# A simple_loop whose per-iteration prologue evaluates a condition and
# leaves the loop through `jump_instruction` (an assembler method
# name taking the end label).
def condition_loop(name, jump_instruction)
  simple_loop(name) { |exit_label|
    condition
    skip_any_whitespace
    asm.send(jump_instruction, exit_label)
  }
end

# Loop while the condition is truthy.
def while_stmt
  condition_loop('while', :jz) # done when == 0 (falsish)
end

# Loop until the condition is truthy.
def until_stmt
  condition_loop('until', :jnz) # done when != 0 (truthy)
end

# Unconditional loop; nothing is emitted at the top of an iteration.
def repeat_stmt
  simple_loop('repeat') { |_exit_label| skip_any_whitespace }
end

# s = 0
# for x = 1 to 5
#   s = s + x
# end
def for_stmt
  counter = asm.define_var(get_name)
  match('=')

  boolean_expression        # initial value

  asm.sub(return_reg, 1)    # pre-decrement because of the
                            # following pre-increment

  asm.mov([counter], EAX)   # stash the counter in memory
  match_word('to', :scan => true)

  boolean_expression        # final value
  skip_any_whitespace

  asm.push(EAX)             # stash final value on stack
  limit = [ESP]

  simple_loop('for') { |exit_label|
    asm.mov(ECX, [counter]) # get the counter
    asm.add(ECX, 1)         # increment
    asm.mov([counter], ECX) # store the counter
    asm.cmp(limit, ECX)     # check if we're done
    asm.jz(exit_label)      # if so jump to the end
  }

  asm.add(ESP, 4)           # clean up the stack
end

# do 5
#   ...
# end
def do_stmt
  boolean_expression
  skip_any_whitespace
  asm.mov(ECX, EAX)

  top = asm.make_label(:do)
  bottom = asm.make_label(:enddo)
  asm.define_label(top)

  # The iteration count is saved on the stack around the body.
  asm.push(ECX)
  pushing_label(bottom) do
    block
  end
  asm.pop(ECX)

  match_word('end')
  asm.dec(ECX)
  asm.jnz(top)

  # Phony push! break needs to clean up the stack, but since we
  # don't know if there is a break at this point we fake a push and
  # always clean up the stack after.
  asm.sub(ESP, 4)

  asm.define_label(bottom)

  # If there was a break we have to clean up the stack here. If
  # there was no break we clean up the phony push above.
  asm.add(ESP, 4)
end

# Jump to the end label of the innermost enclosing loop; outside of
# any loop this is a parse error.
def break_stmt
  target = top_label
  return asm.jmp(target) if target

  expected(:'break to be somewhere useful',
           :got => :'a break outside a loop')
end

# Evaluates any expression for now. There are no boolean operators.
+ def condition + boolean_expression + skip_whitespace + asm.cmp(EAX, 0) # 0 is false, anything else is true + end + + # print eax in hex format + def print_stmt + # variables + d = '__DIGITS' + h = '__HEX' + + digits = if asm.var?(d) + asm.var(d) + else + d_var = asm.define_var(d, 16) + asm.block do + # define a lookup table of digits + mov([d_var], 0x33323130) + mov([d_var+4], 0x37363534) + mov([d_var+8], 0x62613938) + mov([d_var+12], 0x66656463) + end + d_var + end + + # 12 bytes: 2 for "0x", 8 hex digits, 2 for newline + null terminator + hex = asm.var!(h, 12) + + asm.block do + # TODO check sign and prepend '-' if negative + mov([hex], 0x7830) # "0x" ==> 0x30 (48), 0x78 (120) + mov([hex+4], 0) # zero the rest + mov([hex+8], 0) + mov([:byte, hex+10], 0xa) # newline + mov([:byte, hex+11], 0) # null terminator + end + boolean_expression # result in EAX + asm.block do + # convert eax to a hex string + lea(ESI, [digits]) + lea(EDI, [hex+9]) + # build the string backwards (right to left), byte by byte + mov(ECX, 4) + end + asm.block do + define_label(loop_label = make_label) + # low nybble of nth byte + movzx(EBX, AL) + and_(BL, 0x0f) # isolate low nybble + movzx(EDX, [:byte, ESI+EBX]) + mov([EDI], DL) + dec(EDI) + # high nybble of nth byte + movzx(EBX, AL) + and_(BL, 0xf0) # isolate high nybble + shr(BL, 4) + mov(DL, [ESI+EBX]) + mov([EDI], DL) + dec(EDI) + shr(EAX, 8) + loop_(loop_label) + # write(int fd, char *s, int n) + mov(EAX, 4) # SYS_write + lea(ECX, [hex]) # ecx = &s + args = [1, # fd = 1 (STDOUT) + ECX, # s = &s + 11] # n = 11 (excluding term, max # of chars to print) + case platform + when 'darwin' # on the stack, right to left (right @ highest addr) + #### + # setup bogus stack frame + push(EBP) + mov(EBP, ESP) + sub(ESP, 36) + #### + args.reverse.each { |a| push(a) } + push(EAX) + int(0x80) + #### + # teardown bogus stack frame + xor(EAX, EAX) + add(ESP, 36) + pop(EBX) + leave + #### + when 'linux' + mov(EBX, args[0]) + mov(ECX, args[1]) + mov(EDX, 
args[2]) + int(0x80) + end + end + end + + + ############ + # internal # + ############ + + + def eof? + @input.eof? && @look.nil? + end + + def op_char?(char, kind = :all) + OP_CHARS[kind].include?(char) + end + + def op?(kind, token) + OPS[kind].include?(token) + end + + # Read the next character from the input stream. + def get_char + @look = @input.readbyte.chr unless @input.eof? + end + + # Report what was expected + def expected(what, options = {}) + got = options.has_key?(:got) ? options[:got] : @value + got, what = *[got, what].map {|x| x.is_a?(Symbol) ? x : "'#{x}'" } + if eof? + raise ParseError.new(caller), "Premature end of file, expected: #{what}." + else + context = (@input.readline rescue '(EOF)').gsub("\n", "\\n") + raise ParseError.new(caller, context), "Expected #{what} but got #{got}." + end + end + + + + # Recognize an alphabetical character. + def alpha?(char) + ('A'..'Z') === char.upcase + end + + # Recognize a decimal digit. + def digit?(char) + ('0'..'9') === char + end + + # Recognize an alphanumeric character. + def alnum?(char) + alpha?(char) || digit?(char) || char == '_' + end + + # XXX disabled! ... should treat true/false as constants + # once again we need a token of lookahead + def boolean?(char) + #char == 't' || char == 'f' + false + end + + def whitespace?(char) + char == ' ' || char == "\t" + end + + def newline?(char) + char == "\n" || char == "\r" + end + + def comment_char?(char) + char == '#' + end + + def any_whitespace?(char) + whitespace?(char) || newline?(char) + end + + # Parse one or more newlines. + def get_newline + expected(:newline, :got => @look) unless newline?(@look) + many(:newline?) + @token = :newline + @value = "\n" + end + + # Match literal input. + def match(char) + expected(char, :got => @look) unless @look == char + # puts "[ch] #{indent}#{char}" + get_char + skip_whitespace + end + + # Match literal input. 
def match_word(word, options = {})
  # With :scan => true a fresh token is scanned first; otherwise the
  # current token in @value is compared.
  scan if options[:scan]
  matched = (@value == word)
  expected(word) unless matched
  matched
end

# Parse zero or more consecutive characters for which the test is
# true.
#
# test: a Symbol naming a predicate method of the parser, or any
#       object that can be called with [char].
#
# Returns the collected characters; blanks after them are skipped.
def many(test)
  predicate = test.is_a?(Symbol) ? method(test) : test
  collected = ''
  until eof? || !predicate[@look]
    collected << @look
    get_char
  end
  skip_whitespace
  collected
end

# Parse a "name" (keyword or identifier).
def get_name
  expected(:identifier) unless alpha?(@look)
  @value = many(:alnum?)
  @token = if @keyword_names.include?(@value)
             :keyword
           else
             :identifier
           end
  @value
end

# Parse a number. The digits are returned as a String.
def get_number
  expected(:integer) unless digit?(@look)
  @token = :number
  @value = many(:digit?)
  # puts "[nu] #{indent}#{@value} (0x#{@value.to_i.to_s(16)})"
  @value
end

# Parse the name 'true' or 'false'.
def get_boolean
  get_name
  is_boolean = (@value == 'true' || @value == 'false')
  expected(:boolean) unless is_boolean
  @token = :boolean
  # puts "[bo] #{indent}#{@value}"
  @value
end

# Parse an operator: the longest run of operator characters.
def get_op
  expected(:operator) unless op_char?(@look)
  @token = :op
  @value = many(:op_char?)
end

# Skip leading whitespace.
def skip_whitespace
  while whitespace?(@look)
    get_char
  end
end

# Skip leading whitespace including newlines.
+ def skip_any_whitespace + get_char while any_whitespace?(@look) + end + + def skip_comment + get_char until newline?(@look) + skip_any_whitespace + end + + + def indent + real_indent = if @value == 'else' || @value == 'end' + @indent - 1 + else + @indent + end + ' ' * (real_indent * 4) + end + + def pushing(reg) + asm.push(reg) + yield + asm.add(ESP, 4) + end + + def print_token + print(case @token + when :keyword + '[kw] ' + when :number + '[nu] ' + when :identifier + '[id] ' + when :op + '[op] ' + when :boolean + '[bo] ' + when :newline + '' + else + raise "print doesn't know about #{@token}: #{@value}" + end) + print indent + puts @value + end + + def pushing_label(label) + push_label(label) + yield + pop_label + end + + def push_label(label) + @label_stack.push(label) + end + + def top_label + @label_stack[-1] + end + + def pop_label + @label_stack.pop + end + + + # for debugging + def self.hook(callback, methods) + methods.each do |m| + orig = :"orig_#{m}" + alias_method orig, m + define_method(m) do + val = send(orig) + send(callback) + val + end + end + end + + # hook(:print_token, [:get_name, :get_newline, :get_number, :get_op, :get_boolean]) + + end +end