From bc6a3d4d3b33aa1e8c76259f055875eb4bd36695 Mon Sep 17 00:00:00 2001 From: Sami Samhuri Date: Thu, 25 Jun 2009 09:42:56 -0700 Subject: [PATCH] [NEW] Binary assembler outputs working machine code and Mach-O object files that can be linked into working executables. --- Makefile | 13 - asm/asm.rb | 31 +- asm/binary.rb | 918 +++++++++++++++++++++++++++++++-------------- asm/cstruct.rb | 320 ++++++++++++++++ asm/elfsymtab.rb | 7 + asm/elfwriter.rb | 9 + asm/macho.rb | 164 ++++++++ asm/machofile.rb | 364 ++++++++++++++++++ asm/machosym.rb | 29 ++ asm/machosymtab.rb | 77 ++++ asm/machowriter.rb | 26 ++ asm/objwriter.rb | 26 ++ asm/registers.rb | 32 ++ asm/regproxy.rb | 67 ++++ asm/symtab.rb | 89 +++++ asm/text.rb | 21 +- build.rb | 82 ++-- compiler.rb | 279 +++++++------- elfwriter.c | 288 -------------- lea.asm | 12 - mov.asm | 89 ----- test/Makefile | 2 +- test/test.rb | 18 +- test/test_for.code | 2 - x86.txt | 11 - 25 files changed, 2082 insertions(+), 894 deletions(-) create mode 100644 asm/cstruct.rb create mode 100644 asm/elfsymtab.rb create mode 100644 asm/elfwriter.rb create mode 100644 asm/macho.rb create mode 100644 asm/machofile.rb create mode 100644 asm/machosym.rb create mode 100644 asm/machosymtab.rb create mode 100644 asm/machowriter.rb create mode 100644 asm/objwriter.rb create mode 100644 asm/registers.rb create mode 100644 asm/regproxy.rb create mode 100644 asm/symtab.rb delete mode 100644 elfwriter.c delete mode 100644 lea.asm delete mode 100644 mov.asm delete mode 100644 x86.txt diff --git a/Makefile b/Makefile index 8e0b3f0..01f08b9 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,4 @@ test: cd test && make all -elfwriter: elfwriter.c - gcc -o elfwriter elfwriter.c -lelf - -test_elf: elfwriter build - ./elfwriter test.bin 4 test_elf.o - ld -o test_elf test_elf.o - ./test_elf - -clean: - @rm -f elfwriter - @rm -f test_elf.o - @rm -f test_elf - .PHONY: test diff --git a/asm/asm.rb b/asm/asm.rb index b7037d0..3465cc9 100644 --- a/asm/asm.rb +++ 
b/asm/asm.rb @@ -5,6 +5,8 @@ # sjs # may 2009 +require 'asm/registers' + module Assembler # Abstract class for common functionality between different code @@ -14,41 +16,14 @@ module Assembler attr_reader :platform - def initialize(platform='linux', *args) + def initialize(platform) @platform = platform - @vars = {} # Symbol table, maps names to locations in BSS. - @num_labels = 0 # Used to generate unique labels. - @num_labels_with_suffix = Hash.new(0) - - # Maps names to locations. - @labels = Hash.new {|h, key| raise "undefined label: #{key}"} - end def block(*args, &blk) instance_eval(&blk) end - def output - raise "#{self.class} is supposed to implement this method!" - end - - def var(name) - @vars[name] - end - alias_method :var?, :var - - # Generate a unique label. - def label(suffix=nil) - @num_labels += 1 - if suffix - @num_labels_with_suffix[suffix] += 1 - suffix = "_#{suffix}_#{@num_labels_with_suffix[suffix]}" - end - name = "L#{sprintf "%06d", @num_labels}#{suffix}" - return name - end - end end diff --git a/asm/binary.rb b/asm/binary.rb index 154c72e..ac4c4a3 100644 --- a/asm/binary.rb +++ b/asm/binary.rb @@ -5,359 +5,298 @@ # sjs # may 2009 -ROOT = __FILE__.sub(/\/asm\/binary\.rb$/, '') unless defined? ROOT -$LOAD_PATH << ROOT unless $LOAD_PATH.include?(ROOT) - require 'asm/asm' module Assembler - # Define a method named `emit_byte` and one named `binary_size` and - # include this module. Calling the assembler methods will output - # x86 machine code ... hopefully. So far it's incomplete and - # binaries just segfault. class Binary < AssemblerBase - # This structure allows for x86 registers of all sizes. The - # number of the register is the index of the array in which it was - # found. 
- Registers = [ [:eax, :ax, :al], # 0 - [:ecx, :cx, :cl], # 1 - [:edx, :dx, :dl], # 2 - [:ebx, :bx, :bl], # 3 - [:esp, :sp, :ah], # 4 - [:ebp, :bp, :ch], # 5 - [:esi, :si, :dh], # 6 - [:edi, :di, :bh] # 7 - ] + include Registers - # Regex to match any x86 register name. - RegisterRegex = '(e?[acdb]x|e?[sb]p|e?[sd]i|[acdb][hl])' + DEBUG_OUTPUT = false - # Match a literal number in binary, octal, decimal, or hex - NumberRegex = '(0[xXbB]?)?[0-9a-fA-F]+' - - # Match a variable name. - NameRegex = '[a-zA-Z][a-zA-Z0-9]*' - - # 0.size gives the real answer, we only do x86 though + # 0.size gives the real answer, we only do x86-32 though MachineBytes = 4 MachineBits = MachineBytes * 8 MinSigned = -1 * 2**(MachineBits-1) MaxSigned = 2**(MachineBits-1) - 1 MinUnsigned = 0 MaxUnsigned = 2**MachineBits - 1 - SignedRange = MinSigned..MaxSigned + SignedInt = MinSigned..MaxSigned + SignedByte = -128..127 - X86_exit = { - 'linux' => [0x89, 0xc3, # mov ebx, eax (exit code) - 0xb8, 1, 0, 0, 0, # mov eax, 1 - 0xcd, 0x80 # int 0x80 - ].pack('c*'), + # This is used for encoding instructions. Just as the generated asm + # contains "BITS 32", binary is generated for 32-bit protected mode. + DefaultOperandSize = :dword - 'darwin' => [0x50, # push eax (exit code) - 0xb8, 1, 0, 0, 0, # mov eax, 1 - 0xcd, 0x80 # int 0x80 - ].pack('c*') + SizeMap = {:byte => 8, :word => 16, :dword => 32} + + X86_start = { + 'linux' => [], + 'darwin' => [ 0x55, # push ebp + 0x89, 0xe5, # mov ebp, esp + 0x81, 0xec, 8, 0, 0, 0 # sub esp, 8 + ] } - def initialize(platform='linux', binformat='elf') - super + X86_exit = { + 'linux' => [ 0x89, 0xc3, # mov ebx, eax (exit code) + 0xb8, 1, 0, 0, 0, # mov eax, 1 + 0xcd, 0x80 # int 0x80 + ], + + 'darwin' => [ 0xc9, # leave + 0xc3 # ret + ] + } + + attr_reader :eip + + def initialize(platform, symtab, objwriter) + super(platform) + @symtab = symtab + @objwriter = objwriter @binary = [] # Byte array of machine code. 
- @platform = platform - @binformat = binformat - init_sections - end - - def init_sections - case @platform - - when 'linux' - raise "unsupported" unless @binformat == 'elf' - @header_size = 0x100 # ELF, Linux - @text_offset = 0x08048000 + @header_size # Offset of text section in memory - - when 'darwin' - raise "unsupported" unless @binformat == 'macho' - @header_size = 0x100 # Mach-O, Darwin - @text_offset = 0x08048000 + @header_size # Offset of text section in memory - else - raise "unsupported platform: #{platform}" - end - @text_size = 0x02be00 # Size of text section. - @data_offset = @text_offset + @text_size # Offset of data section. - @data_size = 0x4e00 # Size of data section. - @bss_offset = @data_offset + @data_size # Offset of bss section. - @bss_size = 0 # Size of bss section. + @eip = 0 # Our instruction pointer, or the number of bytes written. + + # Always include the _main entry point in our symbol table. It begins at the + # beginning of the __TEXT segment, 0x0. + @symtab.deflabel('_main', @eip) end def output - @binary.pack('c*') + X86_exit[@platform] + resolve_labels + blobs = X86_start[@platform] + @binary + X86_exit[@platform] + binary = blobs.pack('c*') + @objwriter.text(binary) + @objwriter.const(@symtab.const_data) + @objwriter.bss(@symtab.bss_size) + @objwriter.symtab(@symtab) + @objwriter.serialize end - # Define a constant in the .data section. - def const(name, value) - raise "unimplemented!" + def resolve_labels + bytes_read = 0 + @binary.each_with_index do |x, i| + if x.is_a?(Numeric) + bytes_read += 1 + + elsif addr?(x) + @binary[i, 1] = x[1..-1] + bytes_read += 1 + + else # label to resolve + # the actual eip points to the next instruction already, so should we. 
+ real_eip = bytes_read + 4 + addr = @symtab.lookup_label(x) - real_eip # dest - src to get relative addr + puts "resolved label: #{x} = 0x#{@symtab.lookup_label(x).to_s(16)} (rel: 0x#{addr.to_s(16)}, eip = 0x#{real_eip.to_s(16)}, bytes_read = 0x#{bytes_read.to_s(16)})" if DEBUG_OUTPUT + @binary[i, 1] = num_to_quad(addr) + # count the first byte just written, the rest are counted normally + bytes_read += 1 + end + end end - # Define a variable with the given name and size (in dwords). - def defvar(name, dwords=1) - unless var?(name) - @vars[name] = @bss_size - @bss_size += dwords + def addr?(x) + x.is_a?(Array) && x[0] == :addr + end + + def addr_size(addr) + addr.length - 1 + end + + def defconst(name, bytes, value) + @symtab.defconst(name, bytes, value) + end + + # Define a variable with the given name and size in bytes. + def defvar(name, bytes=4) + unless @symtab.var?(name) + @symtab.defvar(name, bytes) else STDERR.puts "[warning] attempted to redefine #{name}" end end - def label(suffix=nil) - name = super - @labels[name] = bytes_written - return name + # These methods are all delegated to the symbol table. + %w[var var? const const?].each do |method| + define_method(method) do |name| + @symtab.send(method, name) + end end + # Count the bytes that were encoded in the given block. def asm # stash the current number of bytes written - instruction_offset = bytes_written + instruction_offset = @eip + print "0x#{@eip.to_s(16).rjust(4, '0')}\t" if DEBUG_OUTPUT + yield # return the number of bytes written - bytes_written - instruction_offset + @eip - instruction_offset + + puts if DEBUG_OUTPUT end + def emit_byte(byte) + + ##### The joke's on me! Array#pack('c*') already does this. It is nice to see + # in the debugging output though, so it stays for now. + # + # Convert negative native ints into signed bytes. + # + # Calculate the signed byte as the difference between -1 (0xff) and some + # number, X. When byte == -1 we want X == 0, so X == -byte - 1. 
+ # Since -byte == ~byte + 1, then -byte - 1 == ~byte + 1 - 1 == ~byte, + # and X == ~byte. We want the *signed byte* -1, so we use 0xff, + # *not* -1. Ruby sees our signed bytes as positive ints 0-255. + # + byte = 0xff - ~byte if byte < 0 && byte >= -128 + + # make sure it's a byte + raise "not a byte: #{byte.inspect}" unless byte == byte & 0xff + + byte = byte & 0xff + ### end of pointless code + + print (byte >= 0 && byte < 0x10 ? '0' : '') + byte.to_s(16) + ' ' if DEBUG_OUTPUT + @binary << byte + @eip += 1 end - def bytes_written - @binary.size + def emit_addr(addr) + @eip += addr.length + addr.insert(0, :addr) + puts addr.inspect if DEBUG_OUTPUT + @binary << addr end - def emit_label(name=label) - @labels[name] = @binary.length + def emit_future_addr(label) + print "<#{label}> " if DEBUG_OUTPUT + @binary << label + @eip += 4 # all jumps are 32-bit relative for now end def emit_dword(num) - num_to_quad(num).each {|byte| emit_byte(byte)} + num_to_quad(num).each { |byte| emit_byte(byte) } + end + + def mklabel(suffix=nil) + @symtab.unique_label(suffix) end - # 0-2: r/m - # 3-5: reg/opcode - # 6-7: mod - # - # dest and src are tuples of the form [type, value] where type is - # any of :reg, :rm32, :imm32. Max _one_ :rm32 arg per call. - def emit_modrm(dest, src, override) - if dest[0] == :reg - reg = override[:op] || regnum(dest[1]) + def emit_label(name) + puts "\n#{name} (0x#{@eip.to_s(16)}):" if DEBUG_OUTPUT + @symtab.deflabel(name, @eip) + end - # mod == 11 (register content) - if src[0] == :reg - mod = 3 - rm = regnum(src[1]) + def emit_modrm(addr, reg=0) + mod = 0 + rm = 0 + disp8 = nil + disp32 = nil + sib = nil - # mod == 00 (pointer) - elsif src[0] == :rm32 - mod = 0 - parts = decode_addr(src[1]) - rm = case parts[0] - # mod == 00 (direct pointer e.g. 
[eax]) - when :reg - regnum(parts[1]) - when :sib - sib = parts[1..-1] - 4 - when :disp - disp = parts[1] - 5 - end + # memory location / pointer + if addr.is_a?(Array) + eff_addr = addr[1] || addr[0] # works with or without size prefix + raise "invalid effective address: #{addr.inspect}" unless eff_addr + case eff_addr + when RegisterProxy + + # mod == 00 + if eff_addr.register? + + # TODO check for ebp / disp32 special case and use [ebp+0] + rm = eff_addr.regnum + + elsif eff_addr.index? && eff_addr.index.is_a?(Numeric) + + # disp8, mod == 01 + if SignedByte === eff_addr.index + mod = 1 + disp8 = eff_addr.index + + # disp32, mod == 10 + elsif SignedRange === eff_addr.index + mod = 2 + disp32 = eff_addr.index + + else + raise "address must fit in 32 bits, this doesn't: #{eff_addr.index}" + end + + elsif eff_addr.index? + # scale-index-base, mod == 00 and rm == 100 + rm = 4 + sib = mk_sib(eff_addr.scale || 1, eff_addr.index, eff_addr.base) + + else + # TODO support scale-index-base byte + raise "unsupported effective address: #{addr.inspect}" + end + + # disp32, mod == 0 + when Numeric + rm = 5 # 101 + disp32 = eff_addr + + else + raise "unsupported effective address: #{addr.inspect}" end - elsif src[0] == :reg - reg = override[:op] || regnum(src[1]) + + # register content, mod == 11 + elsif addr.register? + mod = 3 + rm = addr.regnum + else - raise "unsupported mod r/m byte! dest=#{dest} src=#{src}" + raise "unsupported effective address: #{addr.inspect}" end - emit_byte((mod << 6) & (reg << 3) & rm) - emit_sib(sib) if defined? sib - emit_dword(disp) if defined? 
disp + + emit_byte((mod << 6) | (reg << 3) | rm) + emit_byte(sib) if sib + emit_addr([disp8]) if disp8 + emit_addr(num_to_quad(disp32)) if disp32 end - def emit_sib(sib) - scale, index, base = *sib + + def mk_sib(scale, index, base) if [1,2,4,8].include?(scale) - scale = log2(scale) + scale = log2(scale).to_i else raise "unsupported SIB scale: #{scale}, should be [1, 2, 4, 8]" end - emit_byte((scale << 6) & (index << 3) & base) - end - - def register?(op) - Registers.each_with_index { |list,i| return i if list.include?(op) } - nil - end - - def regnum(op) - num = register?(op) - raise "not a register: #{op.inspect}" unless num - num - end - - def immediate?(op) - op.is_a?(Numeric) || (op.is_a?(String) && op.match(/^#{NumberRegex}$/)) - end - - def rm32?(op) - offset?(op) || op.respond_to?(:match) && op.match(/^ - \[ - #{RegisterRegex} # base register - (\+#{RegisterRegex} # optional index register - (\*[1248])? # optional scale - )? - \] - $/x) - end - - # 6 versions of the mov instruction are supported: - # 1. mov reg32, immediate32 (0xb8+destreg, imm32) - # 2. mov reg32, r/m32 (0x8b, mod r/m, maybe sib) - # 2a. mov eax, memoffset32 (0xa1, disp32) - # 3. mov r/m32, reg32 (0x89, mod r/m, maybe sib) - # 3a. mov memoffset32, eax (0xa3, disp32) - # 4. 
mov r/m32, immediate32 (0xc7, mod r/m, maybe sib, imm32) - def mov(dest, src) - dest = dest[6..-1] if dest.is_a?(String) && dest[0..5] == 'dword ' - src = src[6..-1] if src.is_a?(String) && src[0..5] == 'dword ' - - asm do - - # version 1: mov r32, imm32 - if register?(dest) && immediate?(src) - emit_byte(0xb8 + regnum(dest)) # dest encoded in instruction - emit_dword(parse_num(src)) - - # version 2: mov r32, r/m32 - elsif register?(dest) && rm32?(src) - # version 2a: mov eax, moffs32 - if dest == :eax && offset?(src) - emit_byte(0xa1) - num = decode_addr(src)[1] - emit_dword(num) - else - emit_byte(0x8b) - emit_modrm([:reg, dest], [:rm32, src]) - end - - # version 3: mov r/m32, r32 - elsif rm32?(dest) && register?(src) - # version 3a: mov moffs32, eax - if offset?(dest) && src == :eax - emit_byte(0xa3) - num = decode_addr(dest)[1] - emit_dword(num) - else - emit_byte(0x89) - emit_modrm([:rm32, dest], [:reg, src]) - end - - # version 4: mov r/m32, imm32 - elsif rm32?(dest) && immediate?(src) - emit_byte(0xc7) - emit_modrm([:rm32, dest], [:imm32, src], :op => 0) - else - puts "rm32?(dest): #{rm32?(dest)}\t\trm32?(src): #{rm32?(src)}" - puts "register?(dest): #{register?(dest)}\t\tregister?(src): #{register?(src)}" - puts "immediate?(dest): #{immediate?(dest)}\t\timmediate?(src): #{immediate?(src)}" - puts "offset?(dest): #{offset?(dest)}\t\toffset?(src): #{offset?(src)}" - #raise "unsupported mov format: mov #{dest}, #{src}" - puts "!!! 
unsupported mov format: mov #{dest}, #{src}" - end - - end # asm do - + (scale << 6) | (index.regnum << 3) | base.regnum end - def add(dest, src) + def register?(op, size=DefaultOperandSize) + op.is_a?(RegisterProxy) && op.size == size || op.size == SizeMap[size] end - def sub(dest, src) + def immediate?(op, size=DefaultOperandSize) + bits = SizeMap[size] || size + op.is_a?(Numeric) && op >= -(2 ** bits / 2) && op <= (2 ** bits - 1) end - def imul(op) + def rm?(op, size=DefaultOperandSize) + register?(op, size) || op.is_a?(Array) && (op.size == 1 || op[0] == size) end - def idiv(op) + def offset?(addr, size=DefaultOperandSize) + addr.is_a?(Array) && addr[0].is_a?(Numeric) end - def inc(op) - asm do - if register?(op) - emit_byte(0x40 + regnum(op)) - elsif rm32?(op) - emit_byte(0xff) -# emit_modrm(...) - else - raise "unsupported op #{op}, wanted r32 or r/m32" - end - end + def constant?(op) + immediate?(op) || offset?(op) end - def push(reg) - end - - def cmp(a, b) - end - - - def offset?(addr) - addr.respond_to?(:match) && addr.match(/^\[(#{NameRegex}|#{NumberRegex})\]$/) - end - - def decode_addr(addr) - addr = addr[1..-2] # strip brackets - - if matches = addr.match(/^#{NameRegex}$/) - unless loc = @vars[matches[0]] - raise "undefined variable #{matches[0]}" - end - [:disp, @bss_offset + loc] - elsif matches = addr.match(/^#{NumberRegex}$/) - [:disp, parse_num(matches[0])] - elsif addr.index('*') - bi, scale = *addr.split('*') - base, index = *bi.split('+') - [:sib, scale.to_i, index.to_sym, base.to_sym] - elsif addr.index('+') - base, index = *addr.split('+') - [:sib, 1, index.to_sym, base.to_sym] - else - [:reg, addr.to_sym] - end - end - - # Parse a number from a string. Used by emit_dword. - def parse_num(str) - # If it's not a string it's a number, just return it. - return str unless str.is_a?(String) - - str.downcase! 
- base = 10 # default to base 10 - if str[0, 1] == '0' - base = case str[1, 1] - when 'x' - 16 - when 'b' - str.slice!(2..-1) - 2 - else - 8 - end - end - str.to_i(base) - end # Convert a number to a quad of bytes, discarding excess bits. # Little endian! @@ -396,6 +335,431 @@ module Assembler result end - end # module Binary + + # 8 versions of the mov instruction are supported: + # 1. mov reg32, immediate32 + # 2a. mov reg32, r/m32 + # 2b. mov eax, memoffset32 + # 3a. mov r/m32, reg32 + # 3b. mov memoffset32, eax + # 4. mov r/m32, immediate32 + # 5. mov r/m8, reg8 + # 6. mov reg8, r/m8 + def mov(dest, src) + + # These 2 are used in the same way, just the name differs to make the + # meaning clear. They are 4-byte values that are emitted at the end if + # they are non-nil. Only one of them will be emitted, and if both are + # non-nil that one is immediate. + immediate = nil + offset = nil + + # This is an array of arguments to be passed to emit_modrm, if it is set. + modrm = nil + + # version 1: mov r32, imm32 + if register?(dest) && immediate?(src) + opcode = 0xb8 + dest.regnum # dest encoded in instruction + immediate = src + + # version 2a: mov r32, r/m32 + elsif register?(dest) && rm?(src) + # version 2b: mov eax, moffs32 + if dest == EAX && offset?(src) + opcode = 0xa1 + offset = src[0] + else + opcode = 0x8b + modrm = [src, dest.regnum] + end + + # version 3a: mov r/m32, r32 + elsif rm?(dest) && register?(src) + # version 3b: mov moffs32, eax + if offset?(dest) && src == EAX + opcode = 0xa3 + offset = dest[0] + else + opcode = 0x89 + modrm = [dest, src.regnum] + end + + # version 4: mov r/m32, imm32 + elsif rm?(dest) && immediate?(src) + opcode = 0xc7 + modrm = [dest, 0] + immediate = src + + # version 5: mov r/m8, r8 + elsif rm?(dest, :byte) && register?(src, :byte) + opcode = 0x88 + modrm = [dest, src.regnum] + + # version 6: mov r8, r/m8 + elsif register?(dest, :byte) && rm?(src, :byte) + opcode = 0x8a + modrm = [src, dest.regnum] + + else + # puts 
"rm?(dest): #{rm?(dest)}\t\trm?(src): #{rm?(src)}" + # puts "register?(dest): #{register?(dest)}\t\tregister?(src): #{register?(src)}" + # puts "immediate?(dest): #{immediate?(dest)}\t\timmediate?(src): #{immediate?(src)}" + # puts "offset?(dest): #{offset?(dest)}\t\toffset?(src): #{offset?(src)}" + raise "unsupported MOV instruction, #{dest.inspect}, #{src.inspect}" + end + + asm do + emit_byte(opcode) + emit_modrm(*modrm) if modrm + emit_dword(immediate || offset) if immediate || offset + end + end + + + def movzx(dest, src) + + # movzx Gv, ?? + if register?(dest) + + opcode = case + when rm?(src, :byte): 0xb6 # movzx Gv, Eb + when rm?(src, :word): 0xb7 # movzx Gv, Ew + else + raise "unsupported MOVZX instruction, dest=#{dest.inspect} << src=#{src.inspect} >>" + end + asm do + emit_byte(0x0f) + emit_byte(opcode) + emit_modrm(src, dest.regnum) + end + + else + + raise "unimplemented MOVZX instruction, << dest=#{dest.inspect} >> src=#{src.inspect}" + end + end + + + def add(dest, src) + # add r/m32, imm8 + if rm?(dest) && immediate?(src, :byte) + asm do + emit_byte(0x83) + emit_modrm(dest, 0) + emit_byte(src) + end + + # add r/m32, imm32 + elsif rm?(dest) && immediate?(src) + asm do + emit_byte(0x81) + emit_modrm(dest, 0) + emit_dword(src) + end + + # add eax, imm32 + elsif dest == EAX && immediate?(src) + asm do + emit_byte(0x05) + emit_dword(src) + end + + # add reg32, r/m32 + elsif register?(dest) && rm?(src) + asm do + emit_byte(0x03) + emit_modrm(src, dest.regnum) + end + + else + raise "unsupported ADD instruction, dest=#{dest.inspect} src=#{src.inspect}" + end + end + + + def sub(dest, src) + # sub r/m32, imm8 + if rm?(dest) && immediate?(src, :byte) + asm do + emit_byte(0x83) + emit_modrm(dest, 5) + emit_byte(src) + end + + # sub r/m32, imm32 + elsif rm?(dest) && immediate?(src) + asm do + emit_byte(0x81) + emit_modrm(dest, 5) + emit_dword(src) + end + + # sub r/m32, reg32 + elsif rm?(dest) && register?(src) + asm do + emit_byte(0x29) + emit_modrm(dest, 
src.regnum) + end + + # sub reg32, r/m32 + elsif register?(dest) && rm?(src) + asm do + emit_byte(0x2b) + emit_modrm(src, dest.regnum) + end + + else + raise "unsupported SUB instruction, dest=#{dest.inspect} src=#{src.inspect}" + end + end + + + def imul(op) + raise "unimplemented" + asm do + end + end + + + def idiv(op) + raise "unimplemented" + asm do + end + end + + + def inc(op) + asm do + if register?(op) + emit_byte(0x40 + regnum(op)) + elsif rm?(op) + # emit_byte(0xff) + raise "unimplemented" + else + raise "unsupported op #{op}, wanted r32 or r/m32" + end + end + end + + + def dec(op) + if register?(op) + # dec r16 / dec r32 + asm { emit_byte(0x48 + op.regnum) } + else + raise "unsupported DEC instruction, op=#{op.inspect}" + end + end + + + def shr(op, n) + + # shr r/m??, imm8 + if SignedByte === n + + opcode = register?(op, :byte) ? 0xc0 : 0xc1 + + asm do + emit_byte(opcode) + emit_modrm(op, 5) + emit_byte(n) + end + + else + raise "unsupported SHR instruction, op=#{op.inspect}, n=#{n.inspect}" + end + + end + + + def and_(dest, src) + if rm?(dest, 8) && immediate?(src, 8) + asm do + emit_byte(0x80) + emit_modrm(dest, 4) + emit_byte(src) + end + else + raise "unsupported AND instruction: dest=#{dest.inspect}, src=#{src.inspect}" + end + end + + + def xor(dest, src) + # xor r/m32, reg32 + if rm?(dest) && register?(src) + asm do + emit_byte(0x31) + emit_modrm(dest, src.regnum) + end + + else + raise "unsupported XOR instruction, dest=#{dest.inspect} src=#{src.inspect}" + end + end + + + def not_(op) + if rm?(op) + asm do + emit_byte(0xf7) + emit_modrm(op, 2) + end + else + raise "unsupported NOT instruction: op=#{op.inspect}" + end + end + + + def neg(op) + if rm?(op) + asm do + emit_byte(0xf7) + emit_modrm(op, 3) + end + else + raise "unsupported NEG instruction: op=#{op.inspect}" + end + end + + + def push(op) + # push reg32 + if register?(op) + asm { emit_byte(0x50 + op.regnum) } + + elsif immediate?(op, :byte) + asm do + emit_byte(0x6a) + emit_byte(op) 
+ end + + elsif immediate?(op) + asm do + emit_byte(0x68) + emit_dword(op) + end + + else + raise "unsupported PUSH instruction: op=#{op.inspect}" + end + end + + + def pop(op) + # pop reg32 + if register?(op) + asm { emit_byte(0x58 + op.regnum) } + + else + raise "unsupported POP instruction: op=#{op.inspect}" + end + end + + + def cmp(op1, op2) + # cmp r/m32, reg32 + if rm?(op1) && register?(op2) + asm do + emit_byte(0x39) + emit_modrm(op1, op2.regnum) + end + + # cmp eax, imm32 + elsif op1 == EAX && immediate?(op2) + asm do + emit_byte(0x3d) + emit_dword(op2) + end + + else + raise "unsupported CMP instruction: op1=#{op1.inspect} op2=#{op2.inspect}" + end + end + + + # Only jmp rel32 is supported. + def jmp(label) + asm do + emit_byte(0xe9) + emit_future_addr(label) + end + end + + # These all jump near (rel32). + JccOpcodeMap = Hash.new { |key| raise "unsupported Jcc instruction: #{key}" }. + merge({ + :jc => 0x82, # carry (CF=1) + :je => 0x84, # equal (ZF=1) --- same as jz + :jg => 0x8f, # greater (ZF=0 and SF=OF) + :jl => 0x8c, # less than (SF!=OF) + :jne => 0x85, # not equal (ZF=0) --- same as jnz + :jng => 0x8e, # not greater than (ZF=1 or SF!=OF) + :jnl => 0x8d, # not less than (SF=OF) + :jnz => 0x85, # not zero (ZF=0) + :jo => 0x80, # overflow (OF=1) + :js => 0x88, # sign (SF=1) + :jz => 0x84 # zero (ZF=1) + }) + + # Only Jcc rel32 is supported. + def jcc(instruction, label) + opcode = JccOpcodeMap[instruction] + asm do + emit_byte(0x0f) + emit_byte(opcode) + emit_future_addr(label) + end + end + + JccOpcodeMap.keys.each do |name| + define_method(name) do |label| + jcc(name, label) + end + end + + + def lea(r32, mem) + asm do + emit_byte(0x8d) + emit_modrm(mem, r32.regnum) + end + end + + + def int(n) + asm do + emit_byte(0xcd) + emit_byte(n) + end + end + + + def ret + asm { emit_byte(0xc3) } + end + + + def leave + asm { emit_byte(0xc9) } + end + + + # TODO remove this, LOOP sucks ... only accepts a 1-byte signed offset. 
+ def loop_(label) + real_eip = @eip + 2 # loop instruction is 2 bytes + delta = @symtab.lookup_label(label) - real_eip + unless SignedByte === delta + raise "LOOP can only jump -128 to 127 bytes, #{label} is #{delta} bytes away" + end + + asm do + emit_byte(0xe2) + emit_byte(delta) + end + end + + + end # class Binary end # module Assembler diff --git a/asm/cstruct.rb b/asm/cstruct.rb new file mode 100644 index 0000000..574505d --- /dev/null +++ b/asm/cstruct.rb @@ -0,0 +1,320 @@ +# Struct does some trickery with custom allocators so we can't subclass it without writing C. +# Instead we define a CStruct class that does something similar enough for our purpose. It is +# subclassed just like any other class. A nice side-effect of this syntax is that it is always +# clear that a CStruct is just a class and instances of the struct are objects. +# +# Some light metaprogramming is used to make the following syntax possible: +# +# class MachHeader < CStruct +# uint :magic +# int :cputype +# int :cpusubtype +# ... +# int :flags +# end +# +# Inheritance works as you would expect. +# +# class LoadCommand < CStruct +# uint32 :cmd +# uint32 :cmdsize +# end +# +# # inherits cmd and cmdsize as the first 2 fields +# class SegmentCommand < LoadCommand +# string :segname, 16 +# uint32 :vmaddr +# uint32 +# end +# +# Nothing tricky or confusing there. Members of a CStruct class are declared in the +# class definition. A different definition using a more static approach probably wouldn't +# be very hard... if performance is critical ... but then why are you using Ruby? ;-) + +class CStruct + + + ################### + # Class Constants # + ################### + + # Size in bytes. 
+ SizeMap = { + :int8 => 1, + :uint8 => 1, + :int16 => 2, + :uint16 => 2, + :int32 => 4, + :uint32 => 4, + :string => lambda { |*opts| opts.first }, # first opt is size + # the last 3 are to make the language more C-like + :int => 4, + :uint => 4, + :char => 1 + } + + # 32-bit + PackMap = { + :int8 => 'c', + :uint8 => 'C', + :int16 => 's', + :uint16 => 'S', + :int32 => 'i', + :uint32 => 'I', + :string => lambda do |str, *opts| + len = opts.first + str.ljust(len, "\0")[0, len] + end, + # a few C-like names + :int => 'i', + :uint => 'I', + :char => 'C' + } + + # Only needed when unpacking is different from packing, i.e. strings w/ lambdas in PackMap. + UnpackMap = { + :string => lambda do |str, *opts| + len = opts.first + val = str[0, len-1].sub(/\0*$/, '') + str.slice!((len-1)..-1) + val + end + } + + ########################## + # Class Instance Methods # + ########################## + + # Note: const_get and const_set are used so the constants are bound at runtime, to the + # real class that has subclassed CStruct. I figured Ruby would do this but I haven't + # looked at the implementation of constants so it might be tricky. + # + # All of this could probably be avoided with Ruby 1.9 and private class variables. + # That is definitely something to experiment with. + + class < and . + +module MachO + + + ############### + # Mach header # + ############### + + # Appears at the beginning of every Mach object file. + class MachHeader < CStruct + uint32 :magic + int32 :cputype + int32 :cpusubtype + uint32 :filetype + uint32 :ncmds + uint32 :sizeofcmds + uint32 :flags + end + + # Values for the magic field. + MH_MAGIC = 0xfeedface # Mach magic number. + MH_CIGAM = 0xcefaedfe # In the reverse byte-order. + + # Values for the filetype field. + MH_OBJECT = 0x1 + MH_EXECUTE = 0x2 + MH_FVMLIB = 0x3 + MH_CORE = 0x4 + MH_PRELOAD = 0x5 + MH_DYLIB = 0x6 + MH_DYLINKER = 0x7 + MH_BUNDLE = 0x8 + MH_DYLIB_STUB = 0x9 + MH_DSYM = 0xa + + # CPU types and subtypes (only Intel for now). 
+ CPU_TYPE_X86 = 7 + CPU_TYPE_I386 = CPU_TYPE_X86 + CPU_SUBTYPE_X86_ALL = 3 + + + ############################ + # Load commands / segments # + ############################ + + class LoadCommand < CStruct + uint32 :cmd + uint32 :cmdsize + end + + # Values for the cmd member of LoadCommand CStructs (incomplete!). + LC_SEGMENT = 0x1 + LC_SYMTAB = 0x2 + LC_SYMSEG = 0x3 + LC_THREAD = 0x4 + LC_UNIXTHREAD = 0x5 + + class SegmentCommand < LoadCommand + string :segname, 16 + uint32 :vmaddr + uint32 :vmsize + uint32 :fileoff + uint32 :filesize + int32 :maxprot + int32 :initprot + uint32 :nsects + uint32 :flags + end + + + # Values for protection fields, maxprot and initprot. + VM_PROT_NONE = 0x00 + VM_PROT_READ = 0x01 + VM_PROT_WRITE = 0x02 + VM_PROT_EXECUTE = 0x04 + VM_PROT_NO_CHANGE = 0x08 + VM_PROT_COPY = 0x10 + + + class SymtabCommand < LoadCommand + uint32 :symoff # Points to an array of Nlist structs. + uint32 :nsyms # Number of entries in said array. + uint32 :stroff # Offset of the string table. + uint32 :strsize # Size of the string table in bytes. + end + + + LoadCommandStructMap = { + LC_SEGMENT => SegmentCommand, + LC_SYMTAB => SymtabCommand + } + + + ############ + # Sections # + ############ + + class Section < CStruct + string :sectname, 16 + string :segname, 16 + uint32 :addr + uint32 :size + uint32 :offset + uint32 :align + uint32 :reloff + uint32 :nreloc + uint32 :flags + uint32 :reserved1 + uint32 :reserved2 + end + + # Values for the type bitfield (mask 0x000000ff) of the flags field. + # (incomplete!) + S_REGULAR = 0x0 + S_ZEROFILL = 0x1 + S_CSTRING_LITERALS = 0x2 + + + + ######################## + # Symbol table support # + ######################## + + + # Nlist is used to describe symbols. + class Nlist < CStruct + uint32 :n_strx # Index into string table. Index of zero is the empty string. + uint8 :n_type # Type flag (see below). + uint8 :n_sect # Section number (from 1) or NO_SECT. + uint16 :n_desc # TODO See <mach-o/stab.h>. 
+ uint32 :n_value # The symbol's value (or stab offset). + end + + # Type flag (see <mach-o/nlist.h> for more details) + # --------- + # + # This field consists of four bitfields: + # + # uchar N_STAB : 3 + # uchar N_PEXT : 1 + # uchar N_TYPE : 3 + # uchar N_EXT : 1 + # + N_STAB = 0xe0 # if any bits set => symbolic debugging info + N_PEXT = 0x10 # private external symbol bit + N_TYPE = 0x0e # mask for the type bits + N_EXT = 0x01 # external symbol bit, set for external symbols (e.g. globals) + + # Values for N_TYPE. (incomplete!) + N_UNDF = 0x0 # undefined, n_sect == NO_SECT + N_ABS = 0x2 # absolute, n_sect == NO_SECT + N_SECT = 0xe # defined in section number n_sect + + NO_SECT = 0 + MAX_SECT = 255 + +end \ No newline at end of file diff --git a/asm/machofile.rb b/asm/machofile.rb new file mode 100644 index 0000000..5ff1b5a --- /dev/null +++ b/asm/machofile.rb @@ -0,0 +1,364 @@ +require 'asm/macho' + +module Assembler + + class MachOFile + + include MachO + + attr_accessor :header, :load_commands, :sections, :data + attr_accessor :current_segment + attr_accessor :text_offset + + def initialize(filetype=MH_OBJECT) + @header = MachHeader.new(MH_MAGIC, CPU_TYPE_X86, CPU_SUBTYPE_X86_ALL, filetype, 0, 0, 0) + @load_commands = [] # All defined segments. + @sections = {} # Map of segment names to lists of sections. + @section_disk_size = Hash.new(0) # Sections store their VM size so we need their sizes on disk. + @data = [] # Blobs of data that appear at the end of the file. + # (text, data, symtab, ...) + @current_segment = nil # An alias for the last defined segment. + + # Leave room for __PAGEZERO, a single 0x1000 (4kb) page at 0x0. The + # __TEXT segment starts at 0x1000 and contains mach headers and load + # commands. + @text_offset = 0x1000 + end + + + # Define a LoadCommand in this file. The header's ncmds and sizeofcmds + # fields are updated automatically to keep things in sync. 
If a block is + # given it is passed the new LoadCommand struct after all other + # initialization has been done. + # + # Other methods that create any type of load command should use this + # method to do so. Right now the only types supported are LC_SEGMENT + # and LC_SYMTAB. Modify asm/macho.rb to add structs for other types, and + # add them to LoadCommandStructMap. + + def load_command(cmdtype) + struct = LoadCommandStructMap[cmdtype] + unless struct + raise "unsupported load command type: #{cmdtype.inspect}," + + " supported types: #{LoadCommandStructMap.keys.sort.inspect}" + end + + # Fill in all the unknown fields with 0, this is nonsense for + # string fields but that doesn't really matter. + dummy_vals = [0] * (struct::Members.size - 2) + + # cmd cmdsize ... + command = struct.new(cmdtype, struct.bytesize, *dummy_vals) + + @load_commands << command + + @header[:ncmds] += 1 + @header[:sizeofcmds] += command.bytesize + + yield(command) if block_given? + + return command + end + + + # Define a segment in this file. If a block is given it is passed + # the new segment. You can chain calls to segment, it returns self. + # + # Mach object files should only contain one anonymous segment. This + # is not checked but should be kept in mind when crafting files. + def segment(name, &block) + @current_segment = load_command(LC_SEGMENT) do |seg| + seg[:segname] = name + block.call(seg) if block + end + return self + end + + + # Define a section under the given segment. nsects and cmdsize are + # updated automatically. segname can't be derived from the segment + # that this section is defined under, as they can differ. + # + # Mach object files have the __text, __data, and other common + # sections all defined under one anonymous segment, but their segment + # names reflect their final positions after linking. The linker plonks + # them in the segment that they name. 
+ def section(name, segname, data='', vmsize=data.size, + segment=@current_segment, type=S_REGULAR) + + # Create the new section. + section = Section.new(name, segname, 0, vmsize, 0, 0, 0, 0, 0, 0, type) + + # Add this section to the map of segment names to sections. + (@sections[segment[:segname]] ||= []) << section + @section_disk_size[name] = data.size + @data << data if data.size > 0 + + # Update the header. + @header[:sizeofcmds] += section.bytesize + + # Update the segment. + segment[:nsects] += 1 + segment[:cmdsize] += section.bytesize + + yield(section) if block_given? + + return section + end + + + + # Define a standard text section under the current segment (if present). + # + # If there is no current segment then we act according to the file's type + # (specified in the header). Segments are created if they do not exist. + # + # When it is MH_OBJECT the text section is defined under a single, + # nameless segment, but the section's segment name is set to the name + # given here. + # + # For MH_EXECUTE files the text section goes under the segment with the + # name given (__TEXT). + + def text(data, sectname='__text', segname='__TEXT') + unless @current_segment + segment(segname_based_on_filetype(segname)) do |seg| + seg[:maxprot] = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE + seg[:initprot] = VM_PROT_READ | VM_PROT_EXECUTE + end + end + + section(sectname, segname, data) do |sect| + sect[:flags] = 0x400 # S_ATTR_SOME_INSTRUCTIONS + end + + return self + end + + + # Define a standard data section under the current segment (if present). + # This behaves similarly to the text method. 
+ # + def data(data, sectname='__data', segname='__DATA') + unless @current_segment + segment(segname_based_on_filetype(segname)) do |seg| + seg[:maxprot] = VM_PROT_READ | VM_PROT_WRITE + seg[:initprot] = VM_PROT_READ | VM_PROT_WRITE + end + end + + section(sectname, segname, data) + + return self + end + + + # Define a standard const section under the current segment (if present). + # This behaves similarly to the data method. + # + def const(data, sectname='__const', segname='__DATA') + unless @current_segment + segment(segname_based_on_filetype(segname)) do |seg| + seg[:maxprot] = VM_PROT_READ + seg[:initprot] = VM_PROT_READ + end + end + + section(sectname, segname, data) + + return self + end + + + # Define a standard BSS section under the current segment (if present). + # This behaves similarly to the data method but accepts a VM size instead + # of a blob, and no data is written to file since this section is for + # uninitialized data. + # + def bss(vmsize, sectname='__bss', segname='__DATA') + unless @current_segment + segment(segname_based_on_filetype(segname)) do |seg| + seg[:maxprot] = VM_PROT_READ | VM_PROT_WRITE + seg[:initprot] = VM_PROT_READ | VM_PROT_WRITE + end + end + + section(sectname, segname, '', vmsize) + + return self + end + + + # Define a symbol table. This should usually be placed at the end of the + # file. + # + # This function is overloaded to accept either an array of Nlist structs + # packed into a byte string (i.e. a C array) and a string table, or a + # single parameter: any type of Symtab. + + def symtab(nlist_ary_or_symtab, stab=nil) + if stab.nil? + symtab = nlist_ary_or_symtab + stab = symtab.stab + nlist_ary = symtab.nlist_ary + else + nlist_ary = nlist_ary_or_symtab + end + + load_command(LC_SYMTAB) do |st| + st[:nsyms] = nlist_ary.size + st[:strsize] = stab.size + # symoff and stroff are filled in when offsets are recalculated. 
+ end + +# puts ">>> Defining symbol table:" +# puts ">>> #{nlist_ary.size} symbols" +# puts ">>> stab = #{stab.inspect}" +# puts ">>> nlist_ary = #{nlist_ary.inspect}" +# puts ">>> (serialized) = #{nlist_ary.map{|n|n.serialize}.join.inspect}" + + @data << nlist_ary.map {|n| n.serialize}.join + @data << stab + + return self + end + + + # Serialize the entire MachO file into a byte string. This is simple + # thanks to CStruct#serialize. + + def serialize + # TODO sanity checks, e.g. assert(@header[:ncmds] == @load_commands.size) + # ... perhaps an option to recalculate such data as well. + + recalculate_offsets + + # |------------------| + # | Mach Header | + # |------------------| + # | Segment 1 | + # | Section 1 | --- + # | Section 2 | --|-- + # | ... | | | + # | Segment 2 | | | + # | Section 4 | | | + # | Section 5 | | | + # | ... | | | + # | ... | | | + # | [Symtab cmd] | | | + # |------------------| | | + # | Section data 1 | <-- | + # | Section data 2 | <---- + # | ... | + # | [Symtab data] | + # |------------------| + + + # dump the mach header + obj = @header.serialize + + # dump each load command (which includes the section headers under it) + obj += @load_commands.map do |cmd| + sects = @sections[cmd[:segname]] rescue [] + sects.inject(cmd.serialize) do |data, sect| + data + sect.serialize + end + end.join + + # and finally dump the blobs at the end + obj += @data.join + + return obj + end + + + # Update the file offsets in SegmentCommand, SymtabCommand, and Section structs. + + def recalculate_offsets + + # Maintain the offset into the file. This is used to update + # the various structures. + offset = @header.bytesize + + # First pass over load commands. Most sizes are filled in here. 
+ @load_commands.each do |cmd| + case cmd[:cmd] + + when LC_SEGMENT + seg = cmd + sections = @sections[seg[:segname]] + section_size = sections.size * Section.bytesize + section_vm_size = sections.inject(0) { |total, sect| total + sect[:size] } + section_disk_size = sections.inject(0) do |total, sect| + total + @section_disk_size[sect[:sectname]] + end + + ### TODO this should be redundant. try commenting it out one day. + seg[:nsects] = sections.size + seg[:cmdsize] = seg.bytesize + section_size + ### + + seg[:vmsize] = section_vm_size + seg[:filesize] = section_disk_size + + when LC_SYMTAB + # nop + + else + raise "unsupported load command: #{cmd.inspect}" + end + + offset += cmd[:cmdsize] + end + + + # offset now points to the end of the Mach-O headers, or the beginning + # of the binary blobs of section data at the end. + + # Second pass over load commands. Fill in file offsets. + @load_commands.each do |cmd| + case cmd[:cmd]\ + + when LC_SEGMENT + seg = cmd + sections = @sections[seg[:segname]] + seg[:fileoff] = offset + sections.each do |sect| + sect[:offset] = offset + offset += @section_disk_size[sect[:sectname]] + end + + when LC_SYMTAB + st = cmd + st[:symoff] = offset + offset += st[:nsyms] * Nlist.bytesize + st[:stroff] = offset + offset += st[:strsize] + + + # No else clause is necessary, the first iteration should have caught them. + + end + + end # @load_commands.each + + end # def recalculate_offsets + + + ####### + private + ####### + + def segname_based_on_filetype(segname) + case @header[:filetype] + when MH_OBJECT: '' + when MH_EXECUTE: segname + else + raise "unsupported MachO file type! 
#{@header.inspect}" + end + end + + + end # class MachOFile + +end # module Assembler diff --git a/asm/machosym.rb b/asm/machosym.rb new file mode 100644 index 0000000..6f70e83 --- /dev/null +++ b/asm/machosym.rb @@ -0,0 +1,29 @@ +require 'asm/macho' + +module Assembler + + class MachOSym + + attr_accessor :name, :type, :segnum, :desc, :value + + def initialize(name, type, segnum, desc, value) + @name = name + @type = type + @segnum = segnum + @desc = desc + @value = value + end + + + def to_nlist(strx) + MachO::Nlist.new(strx, @type, @segnum, @desc, @value) + end + + + def to_s + @name + end + + end + +end \ No newline at end of file diff --git a/asm/machosymtab.rb b/asm/machosymtab.rb new file mode 100644 index 0000000..c6db0aa --- /dev/null +++ b/asm/machosymtab.rb @@ -0,0 +1,77 @@ +require 'asm/macho' +require 'asm/machosym' +require 'asm/symtab' + +module Assembler + + class MachOSymtab < Symtab + + include MachO + + def const_offset + return 0x2000 + end + + def bss_offset + # TODO figure out how to calculate these, or how to let the linker do it! + # ... relocation tables perhaps? + return 0x2800 + end + + def all_symbols + symbols = [] + + # Functions (section #1, __text) + # + # All labels are exported. This should be changed and only functions exported! + # TODO fixme ... + # + # Note: Sorting a Ruby hash gives an alist, e.g. [[key, value], ...] + # We can use map on it as if it were a hash so it works nicely. + # + symbols += + @labels.sort { |a,b| a[1] <=> b[1] }. + map do |name,addr| + MachOSym.new(name, N_SECT | N_EXT, 1, 0, addr) + end + + # Constants (section #2, __const) + symbols += @consts.sort { |a,b| a[1] <=> b[1] }. + map do |name, addr| + MachOSym.new(name, N_SECT, 2, 0, addr) + end + + # Variables (section #3, __bss) + # + # TODO FIXME the last var exported ends up after main somewhere... WTF?! + symbols += @vars.sort { |a,b| a[1] <=> b[1] }. 
+ map do |name, addr| + MachOSym.new(name, N_SECT, 3, 0, addr) + end + + return symbols + end + + def nlist_ary + symbols = {} + strx = 1 + ary = [] + all_symbols.each do |sym| + key = sym.name.to_sym + unless symbols.has_key?(key) + symbols[key] = strx + strx += sym.name.length + 1 # +1 for the null byte + end + ary << sym.to_nlist(symbols[key]) + end + return ary + end + + def stab + # The empty strings result in a string that begins and ends with + ['', all_symbols, ''].flatten.map { |sym| sym.to_s }.join("\0") + end + + end + +end diff --git a/asm/machowriter.rb b/asm/machowriter.rb new file mode 100644 index 0000000..9ecf83c --- /dev/null +++ b/asm/machowriter.rb @@ -0,0 +1,26 @@ + + ### XXX development hack! + def stub_symtab! + text_segnum = 1 + symtab_stub = { + :functions => [ + # name type segnum addr + ['_main', N_SECT | N_EXT, text_segunm, 0x0] + ] + } + + nlist_ary = [] + stab = "\0" + strx = 1 # string index (1-based) + + symtab[:functions].each do |name, type, segnum, addr| + nlist_ary << MachO::Nlist.new(strx, type, segnum, 0, addr) + stab << "#{name}\0" + strx += 1 + end + symtab(nlist_ary, stab) + end + + end + +end \ No newline at end of file diff --git a/asm/objwriter.rb b/asm/objwriter.rb new file mode 100644 index 0000000..485bd10 --- /dev/null +++ b/asm/objwriter.rb @@ -0,0 +1,26 @@ +module Assembler + + class UnimplementedMethodError < RuntimeError; end + + + # Abstract base class. + class ObjWriter + + def write!(filename) + File.open(filename, 'wb') do |file| + file.print(serialize) + end + end + + def fail(name) + raise UnimplementedMethodError, name + end + + # These methods must be defined for most uses of the library. 
+ %w[header segment section text data bss symtab serialize].each do |name| + define_method(name) { fail(name) } + end + + end + +end \ No newline at end of file diff --git a/asm/registers.rb b/asm/registers.rb new file mode 100644 index 0000000..fcef603 --- /dev/null +++ b/asm/registers.rb @@ -0,0 +1,32 @@ +require 'asm/regproxy' + +module Assembler + + module Registers + + # This structure allows for x86 registers of all sizes. The + # number of the register is the index of the array in which it was + # found. The size of a register in bytes is 2 ** index-into-sub-array. + Registers = [ [:al, :ax, :eax], # 0 + [:cl, :cx, :ecx], # 1 + [:dl, :dx, :edx], # 2 + [:bl, :bx, :ebx], # 3 + [:ah, :sp, :esp], # 4 + [:ch, :bp, :ebp], # 5 + [:dh, :si, :esi], # 6 + [:bh, :di, :edi] # 7 + ] + + # Setup register proxies which are used both in effective address + # calculations, and also just as symbols representing registers. + Registers.each_with_index do |group, regnum| + group.each_with_index do |reg, i| + name = reg.to_s.upcase + const_set(name, RegisterProxy.new(reg, 8 * (2 ** i), regnum)) + end + end + + + end + +end \ No newline at end of file diff --git a/asm/regproxy.rb b/asm/regproxy.rb new file mode 100644 index 0000000..e7ac6b0 --- /dev/null +++ b/asm/regproxy.rb @@ -0,0 +1,67 @@ +module Assembler + + # Acts like a register and can be used as the base or index in an + # effective address. + # + # e.g. 
[EAX] or [ESI+EBX] or [EAX + 0xff] or [EAX + EDX * 2] + class RegisterProxy + + attr_reader :name, :size, :regnum + attr_reader :base, :index, :scale + + + def initialize(name, size, regnum) + @name = name # attrs are read-only so sharing is ok + @size = size + @regnum = regnum + @base = self + end + + + def +(index) + raise "index already specified" if @index + new_reg = self.clone + new_reg.instance_variable_set('@index', index) + new_reg + end + + + def *(scale) + raise "index must come first" unless @index + raise "scale already specified" if scale + raise "unsupported scale: #{scale}" unless scale.to_s.match(/^[1248]$/) + @scale = scale + self + end + + + def scale? + @scale + end + + + def index? + @index + end + + + def register? + @scale.nil? && @index.nil? + end + + + + def to_s + @name.to_s + + (@index ? "+#{@index}" : '') + + (@scale ? "*#{@scale}" : '') + end + + + def inspect + to_s + end + + end + +end \ No newline at end of file diff --git a/asm/symtab.rb b/asm/symtab.rb new file mode 100644 index 0000000..c0b5100 --- /dev/null +++ b/asm/symtab.rb @@ -0,0 +1,89 @@ +module Assembler + + class Symtab + + attr_reader :const_data, :bss_size + + def initialize + @vars = {} # Map of variable names to addresses. (bss vars) + @consts = {} # Map of constant names to addresses. + @funcs = {} # map of function names to addresses. + + # Initial data to load into memory (data for __DATA segment). + @const_data = '' + + @const_size = 0 # Size of const section. + @bss_size = 0 # Size of bss section. + + # Map names to locations. + @labels = Hash.new {|h, key| raise "undefined label: #{key}"} + @num_labels = 0 # Used to generate unique labels. + @num_labels_with_suffix = Hash.new(0) + end + + + #### + ## NB: Concrete subclasses must define methods named: + ## bss_offset, and const_offset + #### + + + # Generate a unique label. 
+ def unique_label(suffix=nil) + @num_labels += 1 + if suffix + @num_labels_with_suffix[suffix] += 1 + suffix = "_#{suffix}_#{@num_labels_with_suffix[suffix]}" + end + name = "L#{sprintf "%06d", @num_labels}#{suffix}" + return name + end + + def deflabel(name, addr) + @labels[name] = addr + return name + end + + + def lookup_label(name) + @labels[name] + end + + + def defvar(name, bytes) + @vars[name] = @bss_size + @bss_size += bytes + end + + + def defconst(name, value, bytes) + @consts[name] = @const_size + @const_size += bytes + @const_data << [value].pack('i') + end + + + def defun(name, addr) + @funcs[name] = addr + end + + + def var(name) + bss_offset + @vars[name] + end + + def var?(name) + @vars[name] + end + + def const(name) + const_offset + @consts[name] + end + + def const?(name) + @consts[name] + end + + end + +end diff --git a/asm/text.rb b/asm/text.rb index cb152dc..474087e 100644 --- a/asm/text.rb +++ b/asm/text.rb @@ -3,9 +3,6 @@ # sjs # may 2009 -ROOT = __FILE__.sub(/\/asm\/text\.rb$/, '') unless defined? ROOT -$LOAD_PATH << ROOT unless $LOAD_PATH.include?(ROOT) - require 'asm/asm' module Assembler @@ -15,8 +12,9 @@ module Assembler # correct machine code, which isn't trivial. class Text < AssemblerBase - def initialize(platform='linux') + def initialize(platform) super + @vars = {} # Symbol table, maps names to locations in BSS. @data = '' @bss = '' @code = '' @@ -39,6 +37,13 @@ module Assembler end end + + def var(name) + @vars[name] + end + alias_method :var?, :var + + # Emit a line of code wrapped between a tab and a newline. def emit(code, options={}) tab = options.has_key?(:tab) ? 
options[:tab] : "\t" @@ -106,6 +111,10 @@ module Assembler emit("call #{label}") end + def leave + emit("leave") + end + def neg(reg) emit("neg #{reg}") end @@ -165,6 +174,10 @@ module Assembler def int(num) emit("int 0x#{num.to_s(16)}") end + + def cdq + emit("cdq") + end end end diff --git a/build.rb b/build.rb index 16fedc5..323114c 100755 --- a/build.rb +++ b/build.rb @@ -3,10 +3,21 @@ require 'compiler' require 'asm/text' require 'asm/binary' +require 'asm/machosymtab' +require 'asm/machofile' + +# usage: build.rb [elf | macho ] [asm | bin] def main filename = ARGV[0].to_s - raise "can't read #{filename}" unless File.readable?(filename) + raise "can't read #{filename}" unless File.readable?(filename) + binformat = ARGV[1] ? ARGV[1].downcase : 'elf' + format = ARGV[2] ? ARGV[2].downcase : 'asm' + platform = `uname -s`.chomp.downcase + puts "Building #{format} from #{filename} for #{platform}, binformat is #{binformat} ..." + outfile = build(filename, platform, format, binformat) + puts outfile + exit end @@ -18,23 +29,18 @@ def base(filename) end -# filename: input filename +# infile: input filename +# outfile: output filename # asm: assembler to use -# returns: output filename -def compile(filename, asm) +def compile(infile, outfile, asm) - File.open(filename, 'r') do |input| - compiler = Compiler.new(input, asm) - compiler.compile + File.open(infile, 'r') do |input| + File.open(outfile, 'wb') do |out| + compiler = Compiler.new(input, asm) + out.print(compiler.compile) + end end - ext = asm.class.name.split('::').last[0,3].downcase == 'bin' ? 'bin' : 'asm' - outfile = "#{base(filename)}.#{ext}" - File.open(outfile, 'wb') do |out| - out.puts(asm.output) - end - return outfile - rescue ParseError => e error("[error] #{e.message}") error("[context] #{e.context}") @@ -44,12 +50,13 @@ rescue ParseError => e end # assemble using nasm, return resulting filename. 
-def asm(filename, binformat='elf') +def assemble(filename, binformat='elf') f = base(filename) outfile = "#{f}.o" - output = `nasm -f #{binformat} -g -o #{outfile} #{filename}` + output = `nasm -f #{binformat} -g -o #{outfile} #{filename} 2>&1` if $?.exitstatus != 0 - puts output + puts + print output raise "nasm failed: #{$?.exitstatus}" end return outfile @@ -64,32 +71,41 @@ def link(filename, platform='linux') else raise "unsupported platform: #{platform}" end - output = `#{cmd} #{args} -o #{f} #{filename}` + output = `#{cmd} #{args} -o #{f} #{filename} 2>&1` if $?.exitstatus != 0 - puts output + puts + print output raise "ld failed: #{$?.exitstatus}" end - `chmod +x #{f}` + `chmod u+x #{f}` return f end -# TODO Use a dependency injection framework for the assembler, and -# other parts as things become more modular. -def build(filename, platform='linux', format='asm', binformat='elf') - bin = if format == 'asm' - code = compile(filename, Assembler::Text.new(platform)) - obj = asm( code, binformat ) - link( obj, platform ) - else # binary - obj = compile(filename, Assembler::Binary.new(platform)) - link( obj, platform ) - end - return bin +def build(filename, platform='linux', binformat='elf') + objfile = base(filename) + '.o' + symtab, objwriter = + case binformat + when 'elf': [Assembler::ELFSymtab.new, Assembler::ELFFile.new] + when 'macho': [Assembler::MachOSymtab.new, Assembler::MachOFile.new] + else + raise "unsupported binary format: #{binformat}" + end + compile(filename, objfile, Assembler::Binary.new(platform, symtab, objwriter)) + exefile = link(objfile, platform) + return exefile +end + +def build_asm(filename, platform='linux', binformat='elf') + asmfile = base(filename) + '.asm' + compile(filename, asmfile, Assembler::Text.new(platform)) + objfile = assemble(asmfile, binformat) + exefile = link(objfile, platform) + return exefile end def run(filename) filename = "./#{filename}" unless filename.include?('/') - system(filename) + `#{filename}` return 
$?.exitstatus end diff --git a/compiler.rb b/compiler.rb index 5bf9fcf..5004a00 100644 --- a/compiler.rb +++ b/compiler.rb @@ -12,6 +12,8 @@ # require 'rubygems' # require 'unroller' +require 'asm/registers' + class ParseError < StandardError attr_reader :caller, :context def initialize(caller, context=nil) @@ -22,6 +24,8 @@ end class Compiler + include Assembler::Registers + Keywords = %w[ if else end while until repeat for to do break print @@ -82,7 +86,7 @@ class Compiler asm.call(name) else # variable access - asm.mov(:eax, "dword [#{name}]") + asm.mov(EAX, [asm.var(name)]) end end @@ -95,7 +99,7 @@ class Compiler elsif alpha?(@look) identifier # or call elsif digit?(@look) - asm.mov(:eax, get_number.to_i) + asm.mov(EAX, get_number.to_i) else expected(:'integer, identifier, function call, or parenthesized expression', :got => @look) end @@ -106,7 +110,7 @@ class Compiler sign = @look match(sign) if op?(:unary, sign) factor - asm.neg(:eax) if sign == '-' + asm.neg(EAX) if sign == '-' end # Parse and translate a single term (factor or mulop). Result is in @@ -115,11 +119,10 @@ class Compiler signed_factor # Result in eax. while op?(:mul, @look) - pushing(:eax) do - case @look - when '*': multiply - when '/': divide - end + asm.push(EAX) + case @look + when '*': multiply + when '/': divide end end end @@ -130,11 +133,10 @@ class Compiler term # Result is in eax. while op_char?(@look, :add) - pushing(:eax) do - case @look - when '+': add - when '-': subtract - end + asm.push(EAX) + case @look + when '+': add + when '-': subtract end end end @@ -144,7 +146,8 @@ class Compiler def add match('+') term # Result is in eax. - asm.add(:eax, '[esp]') # Add a to b. + asm.pop(EBX) + asm.add(EAX, EBX) # Add a to b. end # Parse a subtraction operator and the 2nd term (b). The result is @@ -152,8 +155,9 @@ class Compiler def subtract match('-') term # Result, b, is in eax. - asm.neg(:eax) # Fake the subtraction. a - b == a + -b - asm.add(:eax, '[esp]') # Add a and -b. 
+ asm.pop(EBX) + asm.neg(EAX) # Fake the subtraction. a - b == a + -b + asm.add(EAX, EBX) # Add a(ebx) to -b(eax). end # Parse an addition operator and the 2nd term (b). The result is @@ -161,7 +165,8 @@ class Compiler def multiply match('*') signed_factor # Result is in eax. - asm.imul('dword [esp]') # Multiply a by b. + asm.pop(EBX) + asm.imul(EBX) # Multiply a by b. end # Parse a division operator and the divisor (b). The result is @@ -169,14 +174,15 @@ class Compiler def divide match('/') signed_factor # Result is in eax. - asm.xchg(:eax, '[esp]') # Swap the divisor and dividend into + asm.pop(EBX) + asm.xchg(EAX, EBX) # Swap the divisor and dividend into # the correct places. # idiv uses edx:eax as the dividend so we need to ensure that edx # is correctly sign-extended w.r.t. eax. asm.cdq # Sign-extend eax into edx (Convert Double to # Quad). - asm.idiv('dword [esp]') # Divide a (eax) by b ([esp]). + asm.idiv(EBX) # Divide a (eax) by b (ebx). end @@ -187,19 +193,22 @@ class Compiler def bitor_expr match('|') term - asm.or(:eax, '[esp]') + asm.pop(EBX) + asm.or_(EAX, EBX) end def bitand_expr match('&') signed_factor - asm.and_(:eax, '[esp]') + asm.pop(EBX) + asm.and_(EAX, EBX) end def xor_expr match('^') term - asm.xor(:eax, '[esp]') + asm.pop(EBX) + asm.xor(EAX, EBX) end @@ -232,9 +241,9 @@ class Compiler def boolean_factor if boolean?(@look) if get_boolean == 'true' - asm.mov(:eax, -1) + asm.mov(EAX, -1) else - asm.xor(:eax, :eax) + asm.xor(EAX, EAX) end scan else @@ -246,8 +255,8 @@ class Compiler if @look == '!' match('!') boolean_factor - make_boolean(:eax) # ensure it is -1 or 0... - asm.not(:eax) # so that not is also boolean not + make_boolean(EAX) # ensure it is -1 or 0... + asm.not_(EAX) # so that 1's complement NOT is also boolean not else boolean_factor end @@ -255,8 +264,8 @@ class Compiler # Convert any identifier to a boolean (-1 or 0). This is # semantically equivalent to !!reg in C or Ruby. 
- def make_boolean(reg=:eax) - end_label = asm.label(:endmakebool) + def make_boolean(reg=EAX) + end_label = asm.mklabel(:endmakebool) asm.cmp(reg, 0) # if false do nothing asm.jz(end_label) asm.mov(reg, -1) # truthy, make it true @@ -267,20 +276,19 @@ class Compiler expression if op_char?(@look, :rel) scan - pushing(:eax) do - case @value - when '==': eq_relation - when '!=': neq_relation - when '>': gt_relation - when '>=': ge_relation - when '<': lt_relation - when '<=': le_relation - end + asm.push(EAX) + case @value + when '==': eq_relation + when '!=': neq_relation + when '>': gt_relation + when '>=': ge_relation + when '<': lt_relation + when '<=': le_relation end end end - # a: [esp] + # a: # b: eax # # If b - a is zero then a = b, and make_boolean will leave the zero @@ -288,14 +296,15 @@ class Compiler # and make_boolean will leave -1 (true) for us in eax. def neq_relation expression - asm.sub(:eax, '[esp]') + asm.pop(EBX) + asm.sub(EAX, EBX) make_boolean end # Invert the != test for equal. def eq_relation neq_relation - asm.not(:eax) + asm.not_(EAX) end # > and < are both implemented in terms of jl (jump if less than). @@ -303,7 +312,13 @@ class Compiler # and order the terms appropriately for each function. As for >= # and <=, they in turn are implemented in terms of > and <. a is # greater than or equal to b if and only if a is *not* less than b. - + # + # Note: This was done to minimize the number of instructions that + # the assembler needed to implement, but since the Jcc + # instructions are very cheap to implement this is no longer + # a concern. + + # The next 4 relations all compare 2 values a and b, then return # true (-1) if the difference was below zero and false (0) # otherwise (using JL, jump if less than). @@ -311,58 +326,62 @@ class Compiler # Invert the sense of the test? 
invert = options[:invert] - true_label = asm.label(:cmp) - end_label = asm.label(:endcmp) + true_label = asm.mklabel(:cmp) + end_label = asm.mklabel(:endcmp) asm.cmp(a, b) asm.jl(true_label) - asm.xor(:eax, :eax) # return false - asm.not(:eax) if invert # (or true if inverted) + asm.xor(EAX, EAX) # return false + asm.not_(EAX) if invert # (or true if inverted) asm.jmp(end_label) asm.emit_label(true_label) - asm.xor(:eax, :eax) # return true - asm.not(:eax) unless invert # (or false if inverted) + asm.xor(EAX, EAX) # return true + asm.not_(EAX) unless invert # (or false if inverted) asm.emit_label(end_label) end - # a: [esp] + # a: # b: eax # # if a > b then b - a < 0 def gt_relation expression - cmp_relation(:eax, '[esp]') # b - a + asm.pop(EBX) + cmp_relation(EAX, EBX) # b - a end - # a: [esp] + # a: # b: eax # # if a < b then a - b < 0 def lt_relation expression - cmp_relation('[esp]', :eax) # a - b + asm.pop(EBX) + cmp_relation(EBX, EAX) # a - b end - # a: [esp] + # a: # b: eax # # if a >= b then !(a < b) def ge_relation expression + asm.pop(EBX) # Compare them as in less than but invert the result. - cmp_relation('[esp]', :eax, :invert => true) + cmp_relation(EBX, EAX, :invert => true) end - # a: [esp] + # a: # b: eax # # if a <= b then !(a > b) def le_relation expression + asm.pop(EBX) # Compare them as in greater than but invert the result. - cmp_relation(:eax, '[esp]', :invert => true) + cmp_relation(EAX, EBX, :invert => true) end @@ -376,7 +395,7 @@ class Compiler match('=') boolean_expression asm.defvar(name) unless asm.var?(name) - asm.mov("dword [#{name}]", :eax) + asm.mov([asm.var(name)], EAX) end # Parse a code block. @@ -413,7 +432,7 @@ class Compiler # Parse an if-else statement. 
def if_else_stmt(label) - else_label = asm.label(:end_or_else) + else_label = asm.mklabel(:end_or_else) end_label = else_label # only generated if else clause # present condition @@ -424,7 +443,7 @@ class Compiler @indent -= 1 if @token == :keyword && @value == 'else' skip_any_whitespace - end_label = asm.label(:endif) # now we need the 2nd label + end_label = asm.mklabel(:endif) # now we need the 2nd label asm.jmp(end_label) asm.emit_label(else_label) @indent += 1 @@ -441,8 +460,8 @@ class Compiler # block: Code to execute at the start of each iteration. (e.g. a # condition) def simple_loop(name) - start_label = asm.label(:"loop_#{name}") - end_label = asm.label(:"end_#{name}") + start_label = asm.mklabel(:"#{name}_loop") + end_label = asm.mklabel(:"end_#{name}") asm.emit_label(start_label) yield(end_label) @@ -482,27 +501,29 @@ class Compiler # s = s + x # e def for_stmt - counter = "[#{get_name}]" + counter = get_name + asm.defvar(counter) match('=') - boolean_expression # initial value - asm.sub(:eax, 1) # pre-decrement because of the - # following pre-increment - asm.mov(counter, :eax) # stash the counter in memory + boolean_expression # initial value + asm.sub(EAX, 1) # pre-decrement because of the + # following pre-increment + asm.mov([asm.var(counter)], EAX) # stash the counter in memory match_word('to', :scan => true) - boolean_expression # final value + boolean_expression # final value skip_any_whitespace - asm.push(:eax) # stash final value on stack - final = '[esp]' + asm.push(EAX) # stash final value on stack + asm.mov(EDX, ESP) + final = [EDX] simple_loop('for') do |end_label| - asm.mov(:ecx, counter) # get the counter - asm.add(:ecx, 1) # increment - asm.mov(counter, :ecx) # store the counter - asm.cmp(final, :ecx) # check if we're done - asm.jz(end_label) # if so jump to the end + asm.mov(ECX, [asm.var(counter)]) # get the counter + asm.add(ECX, 1) # increment + asm.mov([asm.var(counter)], ECX) # store the counter + asm.cmp(final, ECX) # check if 
we're done + asm.jz(end_label) # if so jump to the end end - asm.add(:esp, 4) # clean up the stack + asm.add(ESP, 4) # clean up the stack end # do 5 @@ -512,19 +533,19 @@ class Compiler boolean_expression skip_any_whitespace - asm.mov(:ecx, :eax) + asm.mov(ECX, EAX) - start_label = asm.label(:do) - end_label = asm.label(:enddo) + start_label = asm.mklabel(:do) + end_label = asm.mklabel(:enddo) asm.emit_label(start_label) - asm.push(:ecx) + asm.push(ECX) @indent += 1 block(end_label) @indent -= 1 - asm.pop(:ecx) + asm.pop(ECX) match_word('end') asm.loop_(start_label) @@ -532,13 +553,13 @@ class Compiler # Phony push! break needs to clean up the stack, but since we # don't know if there is a break at this point we fake a push and # always clean up the stack after. - asm.sub(:esp, 4) + asm.sub(ESP, 4) asm.emit_label(end_label) # If there was a break we have to clean up the stack here. If # there was no break we clean up the phony push above. - asm.add(:esp, 4) + asm.add(ESP, 4) end def break_stmt(label) @@ -554,79 +575,83 @@ class Compiler def condition boolean_expression skip_whitespace - asm.cmp(:eax, 0) # 0 is false, anything else is true + asm.cmp(EAX, 0) # 0 is false, anything else is true end # print eax in hex format def print_stmt + # variable names + d = 'DIGITS' + h = 'HEX' + asm.block do # define a lookup table of digits - unless var?('DIGITS') - defvar('DIGITS', 4) - mov('dword [DIGITS]', 0x33323130) - mov('dword [DIGITS+4]', 0x37363534) - mov('dword [DIGITS+8]', 0x62613938) - mov('dword [DIGITS+12]', 0x66656463) + unless var?(d) + defvar(d, 4) + mov([var(d)], 0x33323130) + mov([var(d)+4], 0x37363534) + mov([var(d)+8], 0x62613938) + mov([var(d)+12], 0x66656463) end # 3 dwords == 12 chars - defvar('HEX', 3) unless var?('HEX') + defvar(h, 3) unless var?(h) # TODO check sign and prepend '-' if negative - mov('word [HEX]', 0x7830) # "0x" == [48, 120] - mov('word [HEX+10]', 0xa) # newline + null terminator + mov([var(h)], 0x7830) # "0x" == [48, 120] + 
mov([var(h)+10], 0xa) # newline + null terminator end boolean_expression asm.block do # convert eax to a hex string - lea(:esi, '[DIGITS]') - lea(:edi, '[HEX+9]') + lea(ESI, [var(d)]) + lea(EDI, [var(h)+9]) # build the string backwards (right to left), byte by byte - mov(:ecx, 4) + mov(ECX, 4) end - asm.emit_label(loop_label=asm.label) + asm.emit_label(loop_label=asm.mklabel) asm.block do # low nybble of nth byte - movzx(:ebx, :al) - and_(:bl, 0x0f) # isolate low nybble - movzx(:edx, 'byte [esi+ebx]') - mov('byte [edi]', :dl) - dec(:edi) + movzx(EBX, AL) + and_(BL, 0x0f) # isolate low nybble + movzx(EDX, [:byte, ESI+EBX]) + mov([EDI], DL) + dec(EDI) # high nybble of nth byte - movzx(:ebx, :al) - and_(:bl, 0xf0) # isolate high nybble - shr(:bl, 4) - mov(:dl, 'byte [esi+ebx]') - mov('byte [edi]', :dl) - dec(:edi) - shr(:eax, 8) + movzx(EBX, AL) + and_(BL, 0xf0) # isolate high nybble + shr(BL, 4) + mov(DL, [ESI+EBX]) + mov([EDI], DL) + dec(EDI) + shr(EAX, 8) loop_(loop_label) # write(int fd, char *s, int n) - mov(:eax, 4) # SYS_write - lea(:ecx, '[HEX]') # ecx = &s + mov(EAX, 4) # SYS_write + lea(ECX, [var(h)]) # ecx = &s args = [1, # fd = 1 (STDOUT) - :ecx, # s = &s + ECX, # s = &s 11] # n = 11 (excluding term, max # of chars to print) case platform when 'darwin' # on the stack, right to left (right @ highest addr) #### # setup bogus stack frame - push(:ebp) - mov(:ebp, :esp) - sub(:esp, 36) + push(EBP) + mov(EBP, ESP) + sub(ESP, 36) #### args.reverse.each { |a| push(a) } - push(:eax) + push(EAX) int(0x80) #### # teardown bogus stack frame - xor(:eax, :eax) - add(:esp, 36) - pop(:ebx) - emit("leave") + xor(EAX, EAX) + add(ESP, 36) + pop(EBX) + leave #### when 'linux' - mov(:ebx, args[0]) - mov(:ecx, args[1]) - mov(:edx, args[2]) + mov(EBX, args[0]) + mov(ECX, args[1]) + mov(EDX, args[2]) int(0x80) end end @@ -819,15 +844,15 @@ class Compiler def pushing(reg) asm.push(reg) yield - asm.add(:esp, 4) + asm.add(ESP, 4) end def op(name) - pushing(:eax) do - get_op - 
expected(name) unless match_word(name) - yield - end + asm.push(EAX) + get_op + expected(name) unless match_word(name) + yield + asm.add(ESP, 4) end diff --git a/elfwriter.c b/elfwriter.c deleted file mode 100644 index 95ce6f9..0000000 --- a/elfwriter.c +++ /dev/null @@ -1,288 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -/* _exit(0) */ -/* uint8_t shell_code[] = { */ -/* 0xbb, 0, 0, 0, 0, /\* mov ebx, 0 *\/ */ -/* 0xb8, 1, 0, 0, 0, /\* mov eax, 1 *\/ */ -/* 0xcd, 0x80 /\* int 0x80 *\/ */ -/* }; */ - -/* uint32_t hash_words[] = { */ -/* 0x12345678, */ -/* 0xdeadc0de, */ -/* 0x1234abcd */ -/* }; */ - -#define header_size 0x100 -#define text_addr 0x8048000 + header_size -#define text_size 0x02be00 -#define data_addr text_addr + text_size -#define data_size 0x4e00 -#define bss_addr data_addr + data_size -size_t bss_size = 0; - -char string_table[] = { - /* Offset 0 */ '\0', - /* Offset 1 */ '.', 't', 'e', 'x', 't', '\0' , - /* Offset 7 */ '.', 'b', 's', 's', '\0', - /* Offset 12 */ '.', 's', 'h', 's', 't', 'r', 't', 'a', 'b', '\0' -}; - - -/* Write a static 32-bit x86 ELF binary to filename. The file is - * clobbered without confirmation! 
- */ -int -elf_write(const char *filename, uint8_t *code, size_t code_size) -{ - int fd; - size_t shstrndx; - Elf *elf; - Elf_Scn *scn; - Elf_Data *data; - Elf32_Ehdr *ehdr; - Elf32_Phdr *phdr; - Elf32_Shdr *shdr; - - if (elf_version(EV_CURRENT) == EV_NONE) { - printf("Failed to initialize ELF library!\n"); - return -1; - } - if ((fd = open(filename, O_RDWR|O_TRUNC|O_CREAT, 0666)) < 0) { - printf("Can't open %s for writing.\n", filename); - perror("[elf_write]"); - return -2; - } - if ((elf = elf_begin(fd, ELF_C_WRITE, (Elf *)0)) == 0) { - printf("elf_begin failed!\n"); - return -3; - } - - - /************** - * ELF Header * - **************/ - - if ((ehdr = elf32_newehdr(elf)) == NULL) { - printf("elf32_newehdr failed!\n"); - return -4; - } - ehdr->e_ident[EI_DATA] = ELFDATA2LSB; /* 2's complement, little endian */ - ehdr->e_type = ET_EXEC; - ehdr->e_machine = EM_386; /* x86 */ - - /* Image starts at 0x8048000, x86 32-bit abi. We need a bit - * of room for headers and such. TODO figure out how much - * room is needed! - * - * Current entry point is .text section. 
- */ - ehdr->e_entry = text_addr; - - - /******************* - * Program Headers * - *******************/ - - /* 3 segments => 3 program headers (text, data, bss) */ - if ((phdr = elf32_newphdr(elf, 3)) == NULL) { - printf("elf32_newphdr failed!\n"); - return -5; - } - - - /***************** - * .text section * - *****************/ - - if ((scn = elf_newscn(elf)) == NULL) { - printf("elf_newscn failed!\n"); - return -6; - } - if ((data = elf_newdata(scn)) == NULL) { - printf("elf_newdata failed!\n"); - return -7; - } - data->d_align = 16; - data->d_buf = code; - data->d_off = 0LL; - data->d_type = ELF_T_BYTE; - data->d_size = code_size; - data->d_version = EV_CURRENT; - - if ((shdr = elf32_getshdr(scn)) == NULL) { - printf("elf32_getshdr failed!\n"); - return -8; - } - shdr->sh_name = 1; - shdr->sh_type = SHT_PROGBITS; - shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - shdr->sh_addr = text_addr; - - - /**************** - * .bss section * - ****************/ - - if ((scn = elf_newscn(elf)) == NULL) { - printf("elf_newscn failed!\n"); - return -6; - } - if ((data = elf_newdata(scn)) == NULL) { - printf("elf_newdata failed!\n"); - return -7; - } - data->d_align = 4; - data->d_off = 0LL; - data->d_type = ELF_T_BYTE; - data->d_size = bss_size; - data->d_version = EV_CURRENT; - - if ((shdr = elf32_getshdr(scn)) == NULL) { - printf("elf32_getshdr failed!\n"); - return -8; - } - shdr->sh_name = 7; - shdr->sh_type = SHT_NOBITS; - shdr->sh_flags = SHF_WRITE | SHF_ALLOC; - shdr->sh_addr = bss_addr; - - - /******************************* - * section header string table * - *******************************/ - - if ((scn = elf_newscn(elf)) == NULL) { - printf("elf_newscn failed!\n"); - return -9; - } - if ((data = elf_newdata(scn)) == NULL) { - printf("elf_newdata failed!\n"); - return -10; - } - data->d_align = 1; - data->d_buf = string_table; - data->d_off = 0LL; - data->d_type = ELF_T_BYTE; - data->d_size = sizeof(string_table); - data->d_version = EV_CURRENT; - - if ((shdr = 
elf32_getshdr(scn)) == NULL) { - printf("elf32_getshdr failed!\n"); - return -11; - } - shdr->sh_name = 12; - shdr->sh_type = SHT_STRTAB; - shdr->sh_flags = SHF_STRINGS | SHF_ALLOC; - shdr->sh_entsize = 0; - - - /* int elf_setshstrndx(Elf *e, Elf32_Ehdr *eh, size_t shstrndx) */ - shstrndx = elf_ndxscn(scn); - if (shstrndx >= SHN_LORESERVE) { - if ((scn = elf_getscn(elf, 0)) == NULL) { - printf("elf_getscn failed!\n"); - return -12; - } - /* assert(scn->s_ndx == SHN_UNDEF); */ - /* scn->s_shdr.s_shdr32.sh_link = shstrndx; */ - elf_flagshdr(scn, ELF_C_SET, ELF_F_DIRTY); - shstrndx = SHN_XINDEX; - } - ehdr->e_shstrndx = shstrndx; - - if (elf_update(elf, ELF_C_NULL) < 0) { - printf("elf_update failed!\n"); - return -12; - } - - /* phdr->p_vaddr = phdr->p_paddr = 0x8048000 + ehdr->e_phoff; */ - /* phdr->p_type = PT_PHDR; */ - /* phdr->p_offset = ehdr->e_phoff; */ - /* phdr->p_filesz = elf32_fsize(ELF_T_PHDR, 1, EV_CURRENT); */ - - /* text segment */ - phdr->p_vaddr = text_addr; - phdr->p_type = PT_LOAD; - phdr->p_offset = header_size; - phdr->p_filesz = text_size; - phdr->p_memsz = text_size; - phdr->p_flags = PF_R | PF_X; - phdr->p_align = 0x1000; - - /* data segment */ - phdr++; - phdr->p_vaddr = data_addr; - phdr->p_type = PT_LOAD; - phdr->p_offset = header_size + text_size; - phdr->p_filesz = data_size; - phdr->p_memsz = data_size + 0x1024; /* XXX unsure why the abi specifies + 0x1024 */ - phdr->p_flags = PF_R | PF_W | PF_X; - phdr->p_align = 0x1000; - - /* bss segment */ - phdr++; - phdr->p_vaddr = bss_addr; - phdr->p_type = PT_LOAD; - phdr->p_offset = header_size + text_size + data_size; - phdr->p_filesz = bss_size; - phdr->p_memsz = bss_size; - phdr->p_flags = PF_R | PF_W; - phdr->p_align = 0x1000; - - elf_flagphdr(elf, ELF_C_SET, ELF_F_DIRTY); - - if (elf_update(elf, ELF_C_WRITE) < 0) { - printf("elf_update failed!\n"); - return -13; - } - - elf_end(elf); - close(fd); - return 0; -} - -int -main(int argc, const char *argv[]) -{ - int result; - pid_t pid; - FILE 
*fd; - uint8_t *code = NULL; - size_t code_size = 0, chunk_size = 1024, bytes_read; - - if (argc < 4) { - printf("usage: %s \n", argv[0]); - printf(" Wraps the input file in an ELF binary.\n"); - return 1; - } - - bss_size = strtoul(argv[2], 0, 10); - - if ((fd = fopen(argv[1], "r")) < 0) { - printf("[error] can't open %s for reading.\n", argv[1]); - perror("[main]"); - return 2; - } - while (!feof(fd) && !ferror(fd)) { - code = realloc(code, code_size + chunk_size); - bytes_read = fread(code+code_size, 1, chunk_size, fd); - code_size += bytes_read; - } - fclose(fd); - - printf("Writing x86 ELF binary to %s...\n", argv[3]); - result = elf_write(argv[3], code, code_size); - if (result < 0) { - printf("[error] elf_write failed.\n"); - return 3; - } - - return 0; -} diff --git a/lea.asm b/lea.asm deleted file mode 100644 index b2c9155..0000000 --- a/lea.asm +++ /dev/null @@ -1,12 +0,0 @@ -BITS 32 - -lea eax, [ebx+ecx*4] -lea ebx, [eax+ecx*4] -lea eax, [ecx+ebx*4] -lea eax, [ecx+ebx*8] -lea eax, [ecx+ebx] -lea eax, [0x1000+10*4] -lea eax, [eax] -lea eax, [ecx] -lea ecx, [eax] -lea eax, [0xdeadbeef] diff --git a/mov.asm b/mov.asm deleted file mode 100644 index 85d9ebf..0000000 --- a/mov.asm +++ /dev/null @@ -1,89 +0,0 @@ -BITS 32 - -;;; 00000000 b8 78 56 34 12 b9 78 56 34 12 ba 78 56 34 12 bb |.xV4..xV4..xV4..| -;;; 00000010 78 56 34 12 89 c0 89 c8 89 d0 89 d8 89 c1 89 c9 |xV4.............| -;;; 00000020 89 d1 89 d9 89 c2 89 ca 89 d2 89 da 89 c3 89 cb |................| -;;; 00000030 89 d3 89 db a1 ef be ad de 8b 0d ef be ad de 8b |................| -;;; 00000040 15 ef be ad de 8b 1d ef be ad de a3 ef be ad de |................| -;;; 00000050 89 0d ef be ad de 89 15 ef be ad de 89 1d ef be |................| -;;; 00000060 ad de 8b 00 8b 01 8b 02 8b 03 8b 08 8b 09 8b 0a |................| -;;; 00000070 8b 0b 8b 10 8b 11 8b 12 8b 13 8b 18 8b 19 8b 1a |................| -;;; 00000080 8b 1b 89 00 89 01 89 02 89 03 89 08 89 09 89 0a |................| -;;; 00000090 89 0b 89 
10 89 11 89 12 89 13 89 18 89 19 89 1a |................| -;;; 000000a0 89 1b |..| -;;; 000000a2 - -mov eax, 0x12345678 ; b8 78 56 34 12 -mov ecx, 0x12345678 ; b9 78 56 34 12 -mov edx, 0x12345678 ; ba 78 56 34 12 -mov ebx, 0x12345678 ; bb 78 56 34 12 - -mov eax, eax ; 89 c0 -mov eax, ecx ; 89 c8 -mov eax, edx ; 89 d0 -mov eax, ebx ; 89 d8 - -mov ecx, eax ; 89 c1 -mov ecx, ecx ; 89 c9 -mov ecx, edx ; 89 d1 -mov ecx, ebx ; 89 d9 - -mov edx, eax ; 89 c2 -mov edx, ecx ; 89 ca -mov edx, edx ; 89 d2 -mov edx, ebx ; 89 da - -mov ebx, eax ; 89 c3 -mov ebx, ecx ; 89 cb -mov ebx, edx ; 89 d3 -mov ebx, ebx ; 89 db - -mov eax, dword [0xdeadbeef] ; a1 ef be ad de -mov ecx, dword [0xdeadbeef] ; 8b 0e ef be ad de -mov edx, dword [0xdeadbeef] ; 8b 16 ef be ad de -mov ebx, dword [0xdeadbeef] ; 8b 1e ef be ad de - -mov [0xdeadbeef], eax ; a3 ef be ad de -mov [0xdeadbeef], ecx ; 89 0e ef be ad de -mov [0xdeadbeef], edx ; 89 16 ef be ad de -mov [0xdeadbeef], ebx ; 89 1e ef be ad de - -mov eax, dword [eax] ; 8b 00 -mov eax, dword [ecx] ; 8b 01 -mov eax, dword [edx] ; 8b 02 -mov eax, dword [ebx] ; 8b 03 - -mov ecx, dword [eax] ; 8b 08 -mov ecx, dword [ecx] ; 8b 09 -mov ecx, dword [edx] ; 8b 0a -mov ecx, dword [ebx] ; 8b 0b - -mov edx, dword [eax] ; 8b 10 -mov edx, dword [ecx] ; 8b 11 -mov edx, dword [edx] ; 8b 12 -mov edx, dword [ebx] ; 8b 13 - -mov ebx, dword [eax] ; 8b 18 -mov ebx, dword [ecx] ; 8b 19 -mov ebx, dword [edx] ; 8b 1a -mov ebx, dword [ebx] ; 8b 1b - -mov [eax], eax ; 89 00 -mov [ecx], eax ; 89 01 -mov [edx], eax ; 89 02 -mov [ebx], eax ; 89 03 - -mov [eax], ecx ; 89 08 -mov [ecx], ecx ; 89 09 -mov [edx], ecx ; 89 0a -mov [ebx], ecx ; 89 0b - -mov [eax], edx ; 89 10 -mov [ecx], edx ; 89 11 -mov [edx], edx ; 89 12 -mov [ebx], edx ; 89 13 - -mov [eax], ebx ; 89 18 -mov [ecx], ebx ; 89 19 -mov [edx], ebx ; 89 1a -mov [ebx], ebx ; 89 1b diff --git a/test/Makefile b/test/Makefile index bff7a29..19c43ab 100644 --- a/test/Makefile +++ b/test/Makefile @@ -56,7 +56,7 @@ break: 
test.rb test_break.code print: test.rb test_print.code @./test.rb print $(BINFORMAT) -big_test: test.rb big_test.code +big_test: test.rb test_big.code @./test.rb big $(BINFORMAT) clean: diff --git a/test/test.rb b/test/test.rb index 6227086..83c083d 100755 --- a/test/test.rb +++ b/test/test.rb @@ -5,20 +5,20 @@ $LOAD_PATH << ROOT require 'build' -# usage: build.rb [binformat] -# -# ([format] will go before [binformat]) +# usage: test.rb [binformat] [format] def main func = ARGV[0].to_s - format = 'asm' # 'bin' only assembles one or two - # instructions right now, but support - # is in place - binformat = (ARGV[1] ? ARGV[1] : 'elf').downcase + binformat = ARGV[1] ? ARGV[1].downcase : 'elf' + format = ARGV[2] ? ARGV[2].downcase : 'asm' platform = `uname -s`.chomp.downcase print "testing #{func} ... " - success = run( build("test_#{func}.code", platform, format, binformat) ) - puts success == 0 ? "pass" : "FAIL! (#{success})" + success = run( build("test_#{func}.code", platform, binformat) ) + if success == 0 + puts "pass" + else + puts "FAIL! (#{success})" + end exit(success.to_i) end diff --git a/test/test_for.code b/test/test_for.code index ab33b73..f3291a2 100644 --- a/test/test_for.code +++ b/test/test_for.code @@ -1,7 +1,5 @@ -i=0 a=10 for i = 0 to 10 a=a-1 end a=a - \ No newline at end of file diff --git a/x86.txt b/x86.txt deleted file mode 100644 index 594ea9c..0000000 --- a/x86.txt +++ /dev/null @@ -1,11 +0,0 @@ -mov (0x66) { - reg32, reg32 (0x89) { - op2 - src - - eax ecx edx ebx - op1 eax c0 c8 d0 d8 - dest ecx c1 c9 d1 d9 - edx c2 ca d2 da - ebx c3 cb d3 db - } -}