diff --git a/asm/binary.rb b/asm/binary.rb index bb048ef..78efe81 100644 --- a/asm/binary.rb +++ b/asm/binary.rb @@ -4,8 +4,15 @@ # # sjs # may 2009 +# +# Refer to the Intel[1] or AMD documentation on x86 for explanations +# of Mod-R/M encoding, the Scale-Index-Base (SIB) byte, opcode groups. +# +# The start and exit shell codes were obtained by disassembling +# minimal binaries on the respective platforms. require 'asm/asm' +require 'asm/varproxy' module Assembler @@ -25,8 +32,9 @@ module Assembler SignedInt = MinSigned..MaxSigned SignedByte = -128..127 - # This is used for encoding instructions. Just as the generated asm - # contains "BITS 32", binary is generated for 32-bit protected mode. + # This is used for encoding instructions. Just as the equivalent + # assembly would contain "BITS 32", binary is generated for 32-bit + # protected mode. DefaultOperandSize = :dword SizeMap = {:byte => 8, :word => 16, :dword => 32} @@ -50,63 +58,150 @@ module Assembler ] } - attr_reader :eip + attr_reader :ip - def initialize(platform, symtab, objwriter) + def initialize(platform, symtab, objwriter_class) super(platform) @symtab = symtab - @objwriter = objwriter - @binary = [] # Byte array of machine code. - @eip = 0 # Our instruction pointer, or the number of bytes written. + @objwriter_class = objwriter_class + # @objwriter = objwriter + # Almost a byte array, except for addresses. + # + # Addresses take the form [:<type>, <name>] + # where <type> is one of: var, const, or label + # + # NOTE the type is redundant because of VariableProxy#const? + # and labels are just strings. + # + # however, we could accept strings for variable names + # if we keep the type tag. something to think about. + @ir = [] + + # Our instruction pointer, or the number of bytes written. + @ip = 0 + + # Map locations in the byte array to var proxies so we can + # resolve address operations on the 2nd pass. + @proxies = {} + # Always include the _main entry point in our symbol table.
It begins at the # beginning of the __TEXT segment, 0x0. - @symtab.deflabel('_main', @eip) + @symtab.deflabel('_main', @ip) + + X86_start[@platform].each {|byte| emit_byte(byte)} end def output - resolve_labels - blobs = X86_start[@platform] + @binary + X86_exit[@platform] - binary = blobs.pack('c*') - @objwriter.text(binary) - @objwriter.const(@symtab.const_data) - @objwriter.bss(@symtab.bss_size) - @objwriter.symtab(@symtab) - @objwriter.serialize + X86_exit[@platform].each {|byte| emit_byte(byte)} + + byte_array = resolve_labels + + #puts "1st pass: " + byte_array.inspect if DEBUG_OUTPUT + + binary = package(byte_array) + + @symtab.calculate_offsets(binary.length) + if DEBUG_OUTPUT + puts ">>> text offset: 0x#{@symtab.text_offset.to_s(16)}" + puts ">>> const offset: 0x#{@symtab.const_offset.to_s(16)}" + puts ">>> bss offset: 0x#{@symtab.bss_offset.to_s(16)}" + end + + # Now that we know where everything lies do the 2nd pass + # calculating and filling in final var and const addresses. + # + # outline: + # - resolve all variable proxies in @proxies replacing + # the 4 bytes (0xff) with the real address + + bss_offset = @symtab.bss_offset + const_offset = @symtab.const_offset + @proxies.each do |i, proxy| + #puts ">>> Resolving #{proxy.name}" if DEBUG_OUTPUT + var = @symtab.var(proxy.name) + base_addr = if proxy.const? 
+ const_offset + @symtab.const(proxy.name) + else + bss_offset + @symtab.var(proxy.name) + end + #puts ">>> Replacing #{byte_array[i,4].map{|x|'0x' + x.to_s(16)}.inspect} with #{num_to_quad(proxy.resolve(base_addr)).map{|x|'0x' + x.to_s(16)}.inspect}" if DEBUG_OUTPUT + byte_array[i, 4] = num_to_quad(proxy.resolve(base_addr)) + end + + binary = package(byte_array) + + #puts "2nd pass: " + byte_array.inspect if DEBUG_OUTPUT + + objwriter = @objwriter_class.new + objwriter.text(binary) + objwriter.const(@symtab.const_data) if @symtab.const_size > 0 + objwriter.bss(@symtab.bss_size) if @symtab.bss_size > 0 + objwriter.reloc(@symtab.reloc_info) + objwriter.symtab(@symtab) + objwriter.serialize end def resolve_labels bytes_read = 0 - @binary.each_with_index do |x, i| + bytes = [] + @ir.each_with_index do |x, i| if x.is_a?(Numeric) + bytes << x bytes_read += 1 elsif addr?(x) - @binary[i, 1] = x[1..-1] - bytes_read += 1 + # remember this so we can replace the bogus addr later + @proxies[bytes_read] = x[1] - else # label to resolve + # add a relocation entry for this address + @symtab.reloc(bytes_read) + + # fill in said bogus addr + bytes += [0xff, 0xff, 0xff, 0xff] + + bytes_read += 4 + + + # TODO find out if we should calculate addrs as offsets rather than + # absolute as they are done now. (ok for Mach-O, maybe not ELF) + elsif label?(x) # the actual eip points to the next instruction already, so should we. 
- real_eip = bytes_read + 4 - addr = @symtab.lookup_label(x) - real_eip # dest - src to get relative addr - puts "resolved label: #{x} = 0x#{@symtab.lookup_label(x).to_s(16)} (rel: 0x#{addr.to_s(16)}, eip = 0x#{real_eip.to_s(16)}, bytes_read = 0x#{bytes_read.to_s(16)})" if DEBUG_OUTPUT - @binary[i, 1] = num_to_quad(addr) - # count the first byte just written, the rest are counted normally - bytes_read += 1 + real_ip = bytes_read + 4 + name = x[1] + addr = @symtab.lookup_label(name) - real_ip # dest - src to get relative addr + #puts "resolved label: #{x} = 0x#{@symtab.lookup_label(name).to_s(16)} (rel: 0x#{addr.to_s(16)}, ip = 0x#{real_ip.to_s(16)}, bytes_read = 0x#{bytes_read.to_s(16)})" if DEBUG_OUTPUT + + + bytes += num_to_quad(addr) + bytes_read += 4 + + else + raise "unknown value in the IR at #{bytes_read} - #{x.inspect}" end end + + return bytes end + def package(bytes) + bytes.pack('c*') + end + + # Silly semantics, but labels don't count as an address since they + # don't need to be deferred. def addr?(x) - x.is_a?(Array) && x[0] == :addr - end - - def addr_size(addr) - addr.length - 1 + x.is_a?(Array) && [:var, :const].include?(x[0]) + end + + def label?(x) + x.is_a?(Array) && x[0] == :label end + # XXX this should probably evaluate the value somehow def defconst(name, bytes, value) @symtab.defconst(name, bytes, value) + return const(name) end # Define a variable with the given name and size in bytes. @@ -116,27 +211,49 @@ module Assembler else STDERR.puts "[warning] attempted to redefine #{name}" end + return var(name) end - # These methods are all delegated to the symbol table. - %w[var var? 
const const?].each do |method| - define_method(method) do |name| - @symtab.send(method, name) + def var(name) + STDERR.puts "[error] undefined variable #{name}" unless var?(name) + # TODO bail on undefined vars + VariableProxy.new(name) + end + + def const(name) + STDERR.puts "[error] undefined variable #{name}" unless const?(name) + # TODO bail on undefined consts + VariableProxy.new(name, true) + end + + def var?(name) + @symtab.var?(name) + end + + def const?(name) + @symtab.const?(name) + end + + # Define a variable unless it exists. + def var!(name, bytes=4) + if var?(name) + var(name) + else + defvar(name, bytes) end end - # Count the bytes that were encoded in the given block. def asm # stash the current number of bytes written - instruction_offset = @eip + instruction_offset = @ip - print "0x#{@eip.to_s(16).rjust(4, '0')}\t" if DEBUG_OUTPUT + print "0x#{@ip.to_s(16).rjust(4, '0')}\t" if DEBUG_OUTPUT yield # return the number of bytes written - @eip - instruction_offset + @ip - instruction_offset puts if DEBUG_OUTPUT end @@ -160,26 +277,38 @@ module Assembler # make sure it's a byte raise "not a byte: #{byte.inspect}" unless byte == byte & 0xff - byte = byte & 0xff + byte = byte & 0xff ### end of pointless code print (byte >= 0 && byte < 0x10 ? 
'0' : '') + byte.to_s(16) + ' ' if DEBUG_OUTPUT - @binary << byte - @eip += 1 + @ir << byte + @ip += 1 end - def emit_addr(addr) - @eip += addr.length - addr.insert(0, :addr) - puts addr.inspect if DEBUG_OUTPUT - @binary << addr + # addresses are emitted as arrays of bytes, prefixed with :var, :const, or :label + def emit_addr(type, name) + placeholder = [type, name] + puts placeholder.inspect if DEBUG_OUTPUT + @ir << placeholder + + # all addresses are 32-bits and jumps are all 32-bit relative + @ip += 4 end - def emit_future_addr(label) - print "<#{label}> " if DEBUG_OUTPUT - @binary << label - @eip += 4 # all jumps are 32-bit relative for now + def emit_var(name_or_proxy) + proxy = name_or_proxy.is_a?(VariableProxy) ? name_or_proxy : var(name_or_proxy) + emit_addr(:var, proxy) + end + + def emit_const(name) + proxy = name_or_proxy.is_a?(VariableProxy) ? name_or_proxy : const(name_or_proxy) + emit_addr(:const, proxy) + end + + def emit_label(name) + print "<#{name}> " if DEBUG_OUTPUT + emit_addr(:label, name) end def emit_dword(num) @@ -190,9 +319,9 @@ module Assembler @symtab.unique_label(suffix) end - def emit_label(name) - puts "\n#{name} (0x#{@eip.to_s(16)}):" if DEBUG_OUTPUT - @symtab.deflabel(name, @eip) + def deflabel(name) + puts "\n#{name} (0x#{@ip.to_s(16)}):" if DEBUG_OUTPUT + @symtab.deflabel(name, @ip) end def emit_modrm(addr, reg=0) @@ -201,12 +330,14 @@ module Assembler disp8 = nil disp32 = nil sib = nil + var = nil # variable proxy # effective address if addr.is_a?(Array) eff_addr = addr[1] || addr[0] # works with or without size prefix raise "invalid effective address: #{addr.inspect}" unless eff_addr case eff_addr + when RegisterProxy # Simple register addressing, e.g. [ESI].
@@ -266,6 +397,11 @@ module Assembler rm = 5 # 101 disp32 = eff_addr + when VariableProxy + mod = 0 + rm = 5 + var = eff_addr + else raise "unsupported effective address: #{addr.inspect}" end @@ -275,14 +411,22 @@ module Assembler mod = 3 rm = addr.regnum + # XXX TODO elsif addr.respond_to?(:name) + # (VariableProxy) => [:(var|const), addr.name] + # + # i.e. a pointer to that var + else raise "unsupported effective address: #{addr.inspect}" end emit_byte((mod << 6) | (reg << 3) | rm) emit_byte(sib) if sib - emit_addr([disp8]) if disp8 - emit_addr(num_to_quad(disp32)) if disp32 + + emit_byte(disp8) if disp8 + + emit_dword(disp32) if disp32 + emit_var(var) if var end @@ -311,12 +455,25 @@ module Assembler op.is_a?(Numeric) && op >= -(2 ** bits / 2) && op <= (2 ** bits - 1) end + # Return true if op is a valid operand of the specified size. + # (:byte, :word, :dword) + # + # Valid operands are: + # + # * registers + # + # * effective addresses (wrapped in an array to look like nasm code) + # + # XXX This method is pretty ugly. def rm?(op, size=DefaultOperandSize) - register?(op, size) || op.is_a?(Array) && (op.size == 1 || op[0] == size) + register?(op, size) || + (op.is_a?(Array) && + (op.size == 1 && [Numeric, RegisterProxy, VariableProxy].any?{|c| c == op[0].class}) || + (op.size == 2 && rm?(op[1]))) end def offset?(addr, size=DefaultOperandSize) - addr.is_a?(Array) && addr[0].is_a?(Numeric) + addr.is_a?(Array) && (addr[0].is_a?(Numeric) || addr[0].is_a?(VariableProxy)) end def constant?(op) @@ -382,7 +539,7 @@ module Assembler # This is an array of arguments to be passed to emit_modrm, if it is set. 
modrm = nil - + # version 1: mov r32, imm32 if register?(dest) && immediate?(src) opcode = 0xb8 + dest.regnum # dest encoded in instruction @@ -434,10 +591,20 @@ module Assembler raise "unsupported MOV instruction, #{dest.inspect}, #{src.inspect}" end + dword = immediate || offset + asm do emit_byte(opcode) emit_modrm(*modrm) if modrm - emit_dword(immediate || offset) if immediate || offset + if dword.is_a?(VariableProxy) + if dword.const? + emit_const(dword) + else + emit_var(dword) + end + elsif dword + emit_dword(dword) + end end end @@ -446,7 +613,7 @@ module Assembler # movzx Gv, ?? if register?(dest) - + opcode = case when rm?(src, :byte): 0xb6 # movzx Gv, Eb when rm?(src, :word): 0xb7 # movzx Gv, Ew @@ -742,7 +909,7 @@ module Assembler def jmp(label) asm do emit_byte(0xe9) - emit_future_addr(label) + emit_label(label) end end @@ -768,7 +935,7 @@ module Assembler asm do emit_byte(0x0f) emit_byte(opcode) - emit_future_addr(label) + emit_label(label) end end @@ -807,8 +974,8 @@ module Assembler # NOTE: LOOP only accepts a 1-byte signed offset. Don't use it. def loop_(label) - real_eip = @eip + 2 # loop instruction is 2 bytes - delta = @symtab.lookup_label(label) - real_eip + real_ip = @ip + 2 # loop instruction is 2 bytes + delta = @symtab.lookup_label(label) - real_ip unless SignedByte === delta raise "LOOP can only jump -128 to 127 bytes, #{label} is #{delta} bytes away" end diff --git a/asm/varproxy.rb b/asm/varproxy.rb new file mode 100644 index 0000000..6f25c0d --- /dev/null +++ b/asm/varproxy.rb @@ -0,0 +1,41 @@ +module Assembler + + # Wrap a variable's address so that we can perform arithmetic on it + # before resolving it when we know where things will go in memory. + # All we do is catch arithmetic ops and then provide a means to + # resolve a final address by replaying them later. + # + # e.g.
[symtab.var('i')] or [symtab.var('i') * 2] + class VariableProxy + + attr_reader :name + attr_accessor :ops + + def initialize(name, const=false) + @name = name + @const = const + @ops = [] + end + + %w[+ * / - % & |].each do |op| + define_method(op) do |*args| + new_proxy = self.class.new(@name, @const) + new_proxy.ops << [op, *args] + return new_proxy + end + end + + # XXX should this perhaps use the offset instead? + def resolve(base_addr) + @ops.inject(base_addr) do |addr, op| + addr.send(*op) + end + end + + def const? + @const + end + + end + +end diff --git a/build.rb b/build.rb index 24b454f..43695e7 100755 --- a/build.rb +++ b/build.rb @@ -87,14 +87,14 @@ end def build(filename, platform='linux', binformat='elf') objfile = base(filename) + '.o' - symtab, objwriter = + symtab, objwriter_class = case binformat - when 'elf': [Assembler::ELFSymtab.new, Assembler::ELFFile.new] - when 'macho': [Assembler::MachOSymtab.new, Assembler::MachOFile.new] + when 'elf': [Assembler::ELFSymtab.new, Assembler::ELFFile] + when 'macho': [Assembler::MachOSymtab.new, Assembler::MachOFile] else raise "unsupported binary format: #{binformat}" end - compile(filename, objfile, Assembler::Binary.new(platform, symtab, objwriter)) + compile(filename, objfile, Assembler::Binary.new(platform, symtab, objwriter_class)) exefile = link(objfile, platform) return exefile end diff --git a/compiler.rb b/compiler.rb index 0315f7f..b774534 100644 --- a/compiler.rb +++ b/compiler.rb @@ -13,6 +13,7 @@ # require 'unroller' require 'asm/registers' +require 'asm/varproxy' class ParseError < StandardError attr_reader :caller, :context @@ -34,22 +35,19 @@ class Compiler attr_reader :asm def initialize(input, asm) - # XXX for development only! @indent = 0 # for pretty printing - @look = '' # Next lookahead char. @token = nil # Type of last read token. @value = nil # Value of last read token. @input = input # Stream to read from. 
- - @asm = asm + @asm = asm # assembler # seed the lexer get_char end def compile - block + block # parse a block of code expected(:'end of file') unless eof? asm.output end @@ -267,7 +265,7 @@ class Compiler asm.cmp(reg, 0) # if false do nothing asm.jz(end_label) asm.mov(reg, -1) # truthy, make it true - asm.emit_label(end_label) + asm.deflabel(end_label) end def relation @@ -336,11 +334,11 @@ class Compiler asm.not_(EAX) if invert # (or true if inverted) asm.jmp(end_label) - asm.emit_label(true_label) + asm.deflabel(true_label) asm.xor(EAX, EAX) # return true asm.not_(EAX) unless invert # (or false if inverted) - asm.emit_label(end_label) + asm.deflabel(end_label) end # a: @@ -387,11 +385,14 @@ class Compiler name = @value match('=') boolean_expression - asm.defvar(name) unless asm.var?(name) - asm.mov([asm.var(name)], EAX) + lval = asm.var!(name) + asm.mov([lval], EAX) end # Parse a code block. + # + # TODO replace the case..when with a lookup table + # (might be exposed in the language later) def block(label=nil) scan until @value == 'else' || @value == 'end' || eof? @@ -438,13 +439,13 @@ class Compiler skip_any_whitespace end_label = asm.mklabel(:endif) # now we need the 2nd label asm.jmp(end_label) - asm.emit_label(else_label) + asm.deflabel(else_label) @indent += 1 block(label) @indent -= 1 end match_word('end') - asm.emit_label(end_label) + asm.deflabel(end_label) end # Used to implement the Two-Label-Loops (while, until, repeat). 
@@ -455,7 +456,7 @@ class Compiler def simple_loop(name) start_label = asm.mklabel(:"#{name}_loop") end_label = asm.mklabel(:"end_#{name}") - asm.emit_label(start_label) + asm.deflabel(start_label) yield(end_label) @@ -464,7 +465,7 @@ class Compiler @indent -= 1 match_word('end') asm.jmp(start_label) - asm.emit_label(end_label) + asm.deflabel(end_label) end def condition_loop(name, jump_instruction) @@ -494,13 +495,13 @@ class Compiler # s = s + x # e def for_stmt - counter = get_name - asm.defvar(counter) + name = get_name + counter = asm.defvar(name) match('=') boolean_expression # initial value asm.sub(EAX, 1) # pre-decrement because of the # following pre-increment - asm.mov([asm.var(counter)], EAX) # stash the counter in memory + asm.mov([counter], EAX) # stash the counter in memory match_word('to', :scan => true) boolean_expression # final value skip_any_whitespace @@ -508,9 +509,9 @@ class Compiler final = [ESP] simple_loop('for') do |end_label| - asm.mov(ECX, [asm.var(counter)]) # get the counter + asm.mov(ECX, [counter]) # get the counter asm.add(ECX, 1) # increment - asm.mov([asm.var(counter)], ECX) # store the counter + asm.mov([counter], ECX) # store the counter asm.cmp(final, ECX) # check if we're done asm.jz(end_label) # if so jump to the end end @@ -529,7 +530,7 @@ class Compiler start_label = asm.mklabel(:do) end_label = asm.mklabel(:enddo) - asm.emit_label(start_label) + asm.deflabel(start_label) asm.push(ECX) @@ -548,7 +549,7 @@ class Compiler # always clean up the stack after. asm.sub(ESP, 4) - asm.emit_label(end_label) + asm.deflabel(end_label) # If there was a break we have to clean up the stack here. If # there was no break we clean up the phony push above. 
@@ -573,35 +574,42 @@ class Compiler # print eax in hex format def print_stmt - # variable names - d = 'DIGITS' - h = 'HEX' + # variables + d = '__DIGITS' + h = '__HEX' + + digits = if asm.var?(d) + asm.var(d) + else + d_var = asm.defvar(d, 4) + asm.block do + # define a lookup table of digits + mov([d_var], 0x33323130) + mov([d_var+4], 0x37363534) + mov([d_var+8], 0x62613938) + mov([d_var+12], 0x66656463) + end + d_var + end + + # 3 dwords == 12 chars + hex = asm.var!(h, 3) asm.block do - # define a lookup table of digits - unless var?(d) - defvar(d, 4) - mov([var(d)], 0x33323130) - mov([var(d)+4], 0x37363534) - mov([var(d)+8], 0x62613938) - mov([var(d)+12], 0x66656463) - end - # 3 dwords == 12 chars - defvar(h, 3) unless var?(h) # TODO check sign and prepend '-' if negative - mov([var(h)], 0x7830) # "0x" == [48, 120] - mov([var(h)+10], 0xa) # newline + null terminator + mov([hex], 0x7830) # "0x" == [48, 120] + mov([hex+10], 0xa) # newline + null terminator end boolean_expression asm.block do # convert eax to a hex string - lea(ESI, [var(d)]) - lea(EDI, [var(h)+9]) + lea(ESI, [digits]) + lea(EDI, [hex+9]) # build the string backwards (right to left), byte by byte mov(ECX, 4) end - asm.emit_label(loop_label=asm.mklabel) asm.block do + deflabel(loop_label=mklabel) # low nybble of nth byte movzx(EBX, AL) and_(BL, 0x0f) # isolate low nybble @@ -619,7 +627,7 @@ class Compiler loop_(loop_label) # write(int fd, char *s, int n) mov(EAX, 4) # SYS_write - lea(ECX, [var(h)]) # ecx = &s + lea(ECX, [hex]) # ecx = &s args = [1, # fd = 1 (STDOUT) ECX, # s = &s 11] # n = 11 (excluding term, max # of chars to print)