[big commit] use variable proxies to defer address calculation

This commit is contained in:
Sami Samhuri 2010-01-19 23:23:54 -08:00
parent 19d79c8836
commit 0c21d1abc6
4 changed files with 325 additions and 109 deletions

View file

@ -4,8 +4,15 @@
#
# sjs
# may 2009
#
# Refer to the Intel[1] or AMD documentation on x86 for explanations
# of Mod-R/M encoding, the Scale-Index-Base (SIB) byte, opcode groups.
#
# The start and exit shell codes were obtained by disassembling
# minimal binaries on the respective platforms.
require 'asm/asm'
require 'asm/varproxy'
module Assembler
@ -25,8 +32,9 @@ module Assembler
SignedInt = MinSigned..MaxSigned
SignedByte = -128..127
# This is used for encoding instructions. Just as the generated asm
# contains "BITS 32", binary is generated for 32-bit protected mode.
# This is used for encoding instructions. Just as the equivalent
# assembly would contain "BITS 32", binary is generated for 32-bit
# protected mode.
DefaultOperandSize = :dword
SizeMap = {:byte => 8, :word => 16, :dword => 32}
@ -50,63 +58,150 @@ module Assembler
]
}
attr_reader :eip
attr_reader :ip
def initialize(platform, symtab, objwriter)
def initialize(platform, symtab, objwriter_class)
super(platform)
@symtab = symtab
@objwriter = objwriter
@binary = [] # Byte array of machine code.
@eip = 0 # Our instruction pointer, or the number of bytes written.
@objwriter_class = objwriter_class
# @objwriter = objwriter
# Almost a byte array, except for addresses.
#
# Addresses take the form [:<type>, <name>]
# where <type> is one of: var, const, or label
#
# NOTE the type is redundant because of VariableProxy#const?
# and labels are just strings.
#
# however, we could accept strings for variable names
# if we keep the type tag. something to think about.
@ir = []
# Our instruction pointer, or the number of bytes written.
@ip = 0
# Map locations in the byte array to var proxies so we can
# resolve address operations on the 2nd pass.
@proxies = {}
# Always include the _main entry point in our symbol table. It begins at the
# beginning of the __TEXT segment, 0x0.
@symtab.deflabel('_main', @eip)
@symtab.deflabel('_main', @ip)
X86_start[@platform].each {|byte| emit_byte(byte)}
end
def output
resolve_labels
blobs = X86_start[@platform] + @binary + X86_exit[@platform]
binary = blobs.pack('c*')
@objwriter.text(binary)
@objwriter.const(@symtab.const_data)
@objwriter.bss(@symtab.bss_size)
@objwriter.symtab(@symtab)
@objwriter.serialize
X86_exit[@platform].each {|byte| emit_byte(byte)}
byte_array = resolve_labels
#puts "1st pass: " + byte_array.inspect if DEBUG_OUTPUT
binary = package(byte_array)
@symtab.calculate_offsets(binary.length)
if DEBUG_OUTPUT
puts ">>> text offset: 0x#{@symtab.text_offset.to_s(16)}"
puts ">>> const offset: 0x#{@symtab.const_offset.to_s(16)}"
puts ">>> bss offset: 0x#{@symtab.bss_offset.to_s(16)}"
end
# Now that we know where everything lies do the 2nd pass
# calculating and filling in final var and const addresses.
#
# outline:
# - resolve all variable proxies in @proxies replacing
# the 4 bytes (0xff) with the real address
bss_offset = @symtab.bss_offset
const_offset = @symtab.const_offset
@proxies.each do |i, proxy|
#puts ">>> Resolving #{proxy.name}" if DEBUG_OUTPUT
var = @symtab.var(proxy.name)
base_addr = if proxy.const?
const_offset + @symtab.const(proxy.name)
else
bss_offset + @symtab.var(proxy.name)
end
#puts ">>> Replacing #{byte_array[i,4].map{|x|'0x' + x.to_s(16)}.inspect} with #{num_to_quad(proxy.resolve(base_addr)).map{|x|'0x' + x.to_s(16)}.inspect}" if DEBUG_OUTPUT
byte_array[i, 4] = num_to_quad(proxy.resolve(base_addr))
end
binary = package(byte_array)
#puts "2nd pass: " + byte_array.inspect if DEBUG_OUTPUT
objwriter = @objwriter_class.new
objwriter.text(binary)
objwriter.const(@symtab.const_data) if @symtab.const_size > 0
objwriter.bss(@symtab.bss_size) if @symtab.bss_size > 0
objwriter.reloc(@symtab.reloc_info)
objwriter.symtab(@symtab)
objwriter.serialize
end
def resolve_labels
bytes_read = 0
@binary.each_with_index do |x, i|
bytes = []
@ir.each_with_index do |x, i|
if x.is_a?(Numeric)
bytes << x
bytes_read += 1
elsif addr?(x)
@binary[i, 1] = x[1..-1]
bytes_read += 1
# remember this so we can replace the bogus addr later
@proxies[bytes_read] = x[1]
else # label to resolve
# add a relocation entry for this address
@symtab.reloc(bytes_read)
# fill in said bogus addr
bytes += [0xff, 0xff, 0xff, 0xff]
bytes_read += 4
# TODO find out if we should calculate addrs as offsets rather than
# absolute as they are done now. (ok for Mach-O, maybe not ELF)
elsif label?(x)
# the actual eip points to the next instruction already, so should we.
real_eip = bytes_read + 4
addr = @symtab.lookup_label(x) - real_eip # dest - src to get relative addr
puts "resolved label: #{x} = 0x#{@symtab.lookup_label(x).to_s(16)} (rel: 0x#{addr.to_s(16)}, eip = 0x#{real_eip.to_s(16)}, bytes_read = 0x#{bytes_read.to_s(16)})" if DEBUG_OUTPUT
@binary[i, 1] = num_to_quad(addr)
# count the first byte just written, the rest are counted normally
bytes_read += 1
real_ip = bytes_read + 4
name = x[1]
addr = @symtab.lookup_label(name) - real_ip # dest - src to get relative addr
#puts "resolved label: #{x} = 0x#{@symtab.lookup_label(name).to_s(16)} (rel: 0x#{addr.to_s(16)}, ip = 0x#{real_ip.to_s(16)}, bytes_read = 0x#{bytes_read.to_s(16)})" if DEBUG_OUTPUT
bytes += num_to_quad(addr)
bytes_read += 4
else
raise "unknown value in the IR at #{bytes_read} - #{x.inspect}"
end
end
return bytes
end
# Turn an array of byte values into a binary string, packing every
# element as an 8-bit char (the 'c*' template).
def package(bytes)
  return bytes.pack('c*')
end
# Silly semantics, but labels don't count as an address since they
# don't need to be deferred.
def addr?(x)
x.is_a?(Array) && x[0] == :addr
end
def addr_size(addr)
addr.length - 1
x.is_a?(Array) && [:var, :const].include?(x[0])
end
# True when x is an IR placeholder for a label, i.e. an Array whose
# first element is :label.
def label?(x)
  return false unless x.is_a?(Array)
  x[0] == :label
end
# Define a constant in the symbol table, then return whatever
# const(name) yields for it.
#
# XXX this should probably evaluate the value somehow
def defconst(name, bytes, value)
  @symtab.defconst(name, bytes, value)
  const(name)
end
# Define a variable with the given name and size in bytes.
@ -116,27 +211,49 @@ module Assembler
else
STDERR.puts "[warning] attempted to redefine #{name}"
end
return var(name)
end
# These methods are all delegated to the symbol table.
%w[var var? const const?].each do |method|
define_method(method) do |name|
@symtab.send(method, name)
# Build a VariableProxy for the named variable. An unknown name is
# only reported on STDERR; a proxy is returned regardless.
def var(name)
  # TODO bail on undefined vars
  if !var?(name)
    STDERR.puts "[error] undefined variable #{name}"
  end
  VariableProxy.new(name)
end
# Build a VariableProxy (flagged as a constant) for the named constant.
# An unknown name is only reported on STDERR; a proxy is returned
# regardless.
def const(name)
  # The diagnostic names a constant, not a variable, so it can be told
  # apart from the one printed by var.
  STDERR.puts "[error] undefined constant #{name}" unless const?(name)
  # TODO bail on undefined consts
  VariableProxy.new(name, true)
end
# Ask the symbol table whether +name+ is a variable; the answer is
# whatever @symtab.var? returns.
def var?(name)
@symtab.var?(name)
end
# Ask the symbol table whether +name+ is a constant; the answer is
# whatever @symtab.const? returns.
def const?(name)
@symtab.const?(name)
end
# Return the variable called +name+, defining it first (+bytes+ in
# size, 4 by default) when it does not exist yet.
def var!(name, bytes=4)
  return defvar(name, bytes) unless var?(name)
  var(name)
end
# Count the bytes that were encoded in the given block.
def asm
# stash the current number of bytes written
instruction_offset = @eip
instruction_offset = @ip
print "0x#{@eip.to_s(16).rjust(4, '0')}\t" if DEBUG_OUTPUT
print "0x#{@ip.to_s(16).rjust(4, '0')}\t" if DEBUG_OUTPUT
yield
# return the number of bytes written
@eip - instruction_offset
@ip - instruction_offset
puts if DEBUG_OUTPUT
end
@ -160,26 +277,38 @@ module Assembler
# make sure it's a byte
raise "not a byte: #{byte.inspect}" unless byte == byte & 0xff
byte = byte & 0xff
byte = byte & 0xff
### end of pointless code
print (byte >= 0 && byte < 0x10 ? '0' : '') + byte.to_s(16) + ' ' if DEBUG_OUTPUT
@binary << byte
@eip += 1
@ir << byte
@ip += 1
end
def emit_addr(addr)
@eip += addr.length
addr.insert(0, :addr)
puts addr.inspect if DEBUG_OUTPUT
@binary << addr
# addresses are emitted as [type, name] placeholders, where type is :var, :const, or :label
def emit_addr(type, name)
placeholder = [type, name]
puts placeholder.inspect if DEBUG_OUTPUT
@ir << placeholder
# all addresses are 32-bits and jumps are all 32-bit relative
@ip += 4
end
def emit_future_addr(label)
print "<#{label}> " if DEBUG_OUTPUT
@binary << label
@eip += 4 # all jumps are 32-bit relative for now
# Emit a :var address placeholder. Accepts either a VariableProxy or a
# variable name; a name is turned into a proxy through var first.
def emit_var(name_or_proxy)
  proxy = name_or_proxy
  proxy = var(name_or_proxy) unless name_or_proxy.is_a?(VariableProxy)
  emit_addr(:var, proxy)
end
# Emit a :const address placeholder. Accepts either a VariableProxy or
# a constant name; a name is turned into a proxy through const first.
#
# The parameter is named like the one in emit_var so that the body
# refers to a variable that actually exists.
def emit_const(name_or_proxy)
  proxy = name_or_proxy.is_a?(VariableProxy) ? name_or_proxy : const(name_or_proxy)
  emit_addr(:const, proxy)
end
# Emit a :label address placeholder for the named label, echoing the
# name first when DEBUG_OUTPUT is on.
def emit_label(name)
  if DEBUG_OUTPUT
    print "<#{name}> "
  end
  emit_addr(:label, name)
end
def emit_dword(num)
@ -190,9 +319,9 @@ module Assembler
@symtab.unique_label(suffix)
end
def emit_label(name)
puts "\n#{name} (0x#{@eip.to_s(16)}):" if DEBUG_OUTPUT
@symtab.deflabel(name, @eip)
def deflabel(name)
puts "\n#{name} (0x#{@ip.to_s(16)}):" if DEBUG_OUTPUT
@symtab.deflabel(name, @ip)
end
def emit_modrm(addr, reg=0)
@ -201,12 +330,14 @@ module Assembler
disp8 = nil
disp32 = nil
sib = nil
var = nil # variable proxy
# effective address
if addr.is_a?(Array)
eff_addr = addr[1] || addr[0] # works with or without size prefix
raise "invalid effective address: #{addr.inspect}" unless eff_addr
case eff_addr
when RegisterProxy
# Simple register addressing, e.g. [ESI].
@ -266,6 +397,11 @@ module Assembler
rm = 5 # 101
disp32 = eff_addr
when VariableProxy
mod = 0
rm = 5
var = eff_addr
else
raise "unsupported effective address: #{addr.inspect}"
end
@ -275,14 +411,22 @@ module Assembler
mod = 3
rm = addr.regnum
# XXX TODO elsif addr.respond_to?(:name)
# (VariableProxy) => [:(var|const), addr.name]
#
# i.e. a pointer to that var
else
raise "unsupported effective address: #{addr.inspect}"
end
emit_byte((mod << 6) | (reg << 3) | rm)
emit_byte(sib) if sib
emit_addr([disp8]) if disp8
emit_addr(num_to_quad(disp32)) if disp32
emit_byte(disp8) if disp8
emit_dword(disp32) if disp32
emit_var(var) if var
end
@ -311,12 +455,25 @@ module Assembler
op.is_a?(Numeric) && op >= -(2 ** bits / 2) && op <= (2 ** bits - 1)
end
# Return true if op is a valid operand of the specified size.
# (:byte, :word, :dword)
#
# Valid operands are:
#
# * registers
#
# * effective addresses (wrapped in an array to look like nasm code)
#
# XXX This method is pretty ugly.
def rm?(op, size=DefaultOperandSize)
register?(op, size) || op.is_a?(Array) && (op.size == 1 || op[0] == size)
register?(op, size) ||
(op.is_a?(Array) &&
(op.size == 1 && [Numeric, RegisterProxy, VariableProxy].any?{|c| c == op[0].class}) ||
(op.size == 2 && rm?(op[1])))
end
def offset?(addr, size=DefaultOperandSize)
addr.is_a?(Array) && addr[0].is_a?(Numeric)
addr.is_a?(Array) && (addr[0].is_a?(Numeric) || addr[0].is_a?(VariableProxy))
end
def constant?(op)
@ -382,7 +539,7 @@ module Assembler
# This is an array of arguments to be passed to emit_modrm, if it is set.
modrm = nil
# version 1: mov r32, imm32
if register?(dest) && immediate?(src)
opcode = 0xb8 + dest.regnum # dest encoded in instruction
@ -434,10 +591,20 @@ module Assembler
raise "unsupported MOV instruction, #{dest.inspect}, #{src.inspect}"
end
dword = immediate || offset
asm do
emit_byte(opcode)
emit_modrm(*modrm) if modrm
emit_dword(immediate || offset) if immediate || offset
if dword.is_a?(VariableProxy)
if dword.const?
emit_const(dword)
else
emit_var(dword)
end
elsif dword
emit_dword(dword)
end
end
end
@ -446,7 +613,7 @@ module Assembler
# movzx Gv, ??
if register?(dest)
opcode = case
when rm?(src, :byte): 0xb6 # movzx Gv, Eb
when rm?(src, :word): 0xb7 # movzx Gv, Ew
@ -742,7 +909,7 @@ module Assembler
def jmp(label)
asm do
emit_byte(0xe9)
emit_future_addr(label)
emit_label(label)
end
end
@ -768,7 +935,7 @@ module Assembler
asm do
emit_byte(0x0f)
emit_byte(opcode)
emit_future_addr(label)
emit_label(label)
end
end
@ -807,8 +974,8 @@ module Assembler
# NOTE: LOOP only accepts a 1-byte signed offset. Don't use it.
def loop_(label)
real_eip = @eip + 2 # loop instruction is 2 bytes
delta = @symtab.lookup_label(label) - real_eip
real_ip = @ip + 2 # loop instruction is 2 bytes
delta = @symtab.lookup_label(label) - real_ip
unless SignedByte === delta
raise "LOOP can only jump -128 to 127 bytes, #{label} is #{delta} bytes away"
end

41
asm/varproxy.rb Normal file
View file

@ -0,0 +1,41 @@
module Assembler

  # Wrap a variable's address so that we can perform arithmetic on it
  # before resolving it when we know where things will go in memory.
  # All we do is catch arithmetic ops and then provide a means to
  # resolve a final address by replaying them later.
  #
  # e.g. [symtab.var('i')] or [symtab.var('i') * 2]
  class VariableProxy

    # Name of the wrapped variable or constant.
    attr_reader :name

    # Recorded operations, oldest first. Each entry is an array of the
    # form [operator, *arguments].
    attr_accessor :ops

    # name::  name of the variable or constant
    # const:: true when the proxy stands for a constant
    def initialize(name, const=false)
      @name = name
      @const = const
      @ops = []
    end

    # Each operator returns a NEW proxy and leaves the receiver
    # untouched. The new proxy carries every op recorded so far plus
    # the one just applied, so chained expressions such as
    # (var + 4) * 2 replay all steps in order.
    %w[+ * / - % & |].each do |op|
      define_method(op) do |*args|
        new_proxy = self.class.new(@name, @const)
        new_proxy.ops = @ops + [[op, *args]]
        new_proxy
      end
    end

    # Replay the recorded ops, in order, starting from base_addr and
    # return the result.
    #
    # XXX should this perhaps use the offset instead?
    def resolve(base_addr)
      @ops.inject(base_addr) do |addr, op|
        addr.send(*op)
      end
    end

    # Whether this proxy was created for a constant.
    def const?
      @const
    end

  end

end

View file

@ -87,14 +87,14 @@ end
def build(filename, platform='linux', binformat='elf')
objfile = base(filename) + '.o'
symtab, objwriter =
symtab, objwriter_class =
case binformat
when 'elf': [Assembler::ELFSymtab.new, Assembler::ELFFile.new]
when 'macho': [Assembler::MachOSymtab.new, Assembler::MachOFile.new]
when 'elf': [Assembler::ELFSymtab.new, Assembler::ELFFile]
when 'macho': [Assembler::MachOSymtab.new, Assembler::MachOFile]
else
raise "unsupported binary format: #{binformat}"
end
compile(filename, objfile, Assembler::Binary.new(platform, symtab, objwriter))
compile(filename, objfile, Assembler::Binary.new(platform, symtab, objwriter_class))
exefile = link(objfile, platform)
return exefile
end

View file

@ -13,6 +13,7 @@
# require 'unroller'
require 'asm/registers'
require 'asm/varproxy'
class ParseError < StandardError
attr_reader :caller, :context
@ -34,22 +35,19 @@ class Compiler
attr_reader :asm
def initialize(input, asm)
# XXX for development only!
@indent = 0 # for pretty printing
@look = '' # Next lookahead char.
@token = nil # Type of last read token.
@value = nil # Value of last read token.
@input = input # Stream to read from.
@asm = asm
@asm = asm # assembler
# seed the lexer
get_char
end
def compile
block
block # parse a block of code
expected(:'end of file') unless eof?
asm.output
end
@ -267,7 +265,7 @@ class Compiler
asm.cmp(reg, 0) # if false do nothing
asm.jz(end_label)
asm.mov(reg, -1) # truthy, make it true
asm.emit_label(end_label)
asm.deflabel(end_label)
end
def relation
@ -336,11 +334,11 @@ class Compiler
asm.not_(EAX) if invert # (or true if inverted)
asm.jmp(end_label)
asm.emit_label(true_label)
asm.deflabel(true_label)
asm.xor(EAX, EAX) # return true
asm.not_(EAX) unless invert # (or false if inverted)
asm.emit_label(end_label)
asm.deflabel(end_label)
end
# a: <on the stack>
@ -387,11 +385,14 @@ class Compiler
name = @value
match('=')
boolean_expression
asm.defvar(name) unless asm.var?(name)
asm.mov([asm.var(name)], EAX)
lval = asm.var!(name)
asm.mov([lval], EAX)
end
# Parse a code block.
#
# TODO replace the case..when with a lookup table
# (might be exposed in the language later)
def block(label=nil)
scan
until @value == 'else' || @value == 'end' || eof?
@ -438,13 +439,13 @@ class Compiler
skip_any_whitespace
end_label = asm.mklabel(:endif) # now we need the 2nd label
asm.jmp(end_label)
asm.emit_label(else_label)
asm.deflabel(else_label)
@indent += 1
block(label)
@indent -= 1
end
match_word('end')
asm.emit_label(end_label)
asm.deflabel(end_label)
end
# Used to implement the Two-Label-Loops (while, until, repeat).
@ -455,7 +456,7 @@ class Compiler
def simple_loop(name)
start_label = asm.mklabel(:"#{name}_loop")
end_label = asm.mklabel(:"end_#{name}")
asm.emit_label(start_label)
asm.deflabel(start_label)
yield(end_label)
@ -464,7 +465,7 @@ class Compiler
@indent -= 1
match_word('end')
asm.jmp(start_label)
asm.emit_label(end_label)
asm.deflabel(end_label)
end
def condition_loop(name, jump_instruction)
@ -494,13 +495,13 @@ class Compiler
# s = s + x
# e
def for_stmt
counter = get_name
asm.defvar(counter)
name = get_name
counter = asm.defvar(name)
match('=')
boolean_expression # initial value
asm.sub(EAX, 1) # pre-decrement because of the
# following pre-increment
asm.mov([asm.var(counter)], EAX) # stash the counter in memory
asm.mov([counter], EAX) # stash the counter in memory
match_word('to', :scan => true)
boolean_expression # final value
skip_any_whitespace
@ -508,9 +509,9 @@ class Compiler
final = [ESP]
simple_loop('for') do |end_label|
asm.mov(ECX, [asm.var(counter)]) # get the counter
asm.mov(ECX, [counter]) # get the counter
asm.add(ECX, 1) # increment
asm.mov([asm.var(counter)], ECX) # store the counter
asm.mov([counter], ECX) # store the counter
asm.cmp(final, ECX) # check if we're done
asm.jz(end_label) # if so jump to the end
end
@ -529,7 +530,7 @@ class Compiler
start_label = asm.mklabel(:do)
end_label = asm.mklabel(:enddo)
asm.emit_label(start_label)
asm.deflabel(start_label)
asm.push(ECX)
@ -548,7 +549,7 @@ class Compiler
# always clean up the stack after.
asm.sub(ESP, 4)
asm.emit_label(end_label)
asm.deflabel(end_label)
# If there was a break we have to clean up the stack here. If
# there was no break we clean up the phony push above.
@ -573,35 +574,42 @@ class Compiler
# print eax in hex format
def print_stmt
# variable names
d = 'DIGITS'
h = 'HEX'
# variables
d = '__DIGITS'
h = '__HEX'
digits = if asm.var?(d)
asm.var(d)
else
d_var = asm.defvar(d, 4)
asm.block do
# define a lookup table of digits
mov([d_var], 0x33323130)
mov([d_var+4], 0x37363534)
mov([d_var+8], 0x62613938)
mov([d_var+12], 0x66656463)
end
d_var
end
# 3 dwords == 12 chars
hex = asm.var!(h, 3)
asm.block do
# define a lookup table of digits
unless var?(d)
defvar(d, 4)
mov([var(d)], 0x33323130)
mov([var(d)+4], 0x37363534)
mov([var(d)+8], 0x62613938)
mov([var(d)+12], 0x66656463)
end
# 3 dwords == 12 chars
defvar(h, 3) unless var?(h)
# TODO check sign and prepend '-' if negative
mov([var(h)], 0x7830) # "0x" == [48, 120]
mov([var(h)+10], 0xa) # newline + null terminator
mov([hex], 0x7830) # "0x" == [48, 120]
mov([hex+10], 0xa) # newline + null terminator
end
boolean_expression
asm.block do
# convert eax to a hex string
lea(ESI, [var(d)])
lea(EDI, [var(h)+9])
lea(ESI, [digits])
lea(EDI, [hex+9])
# build the string backwards (right to left), byte by byte
mov(ECX, 4)
end
asm.emit_label(loop_label=asm.mklabel)
asm.block do
deflabel(loop_label=mklabel)
# low nybble of nth byte
movzx(EBX, AL)
and_(BL, 0x0f) # isolate low nybble
@ -619,7 +627,7 @@ class Compiler
loop_(loop_label)
# write(int fd, char *s, int n)
mov(EAX, 4) # SYS_write
lea(ECX, [var(h)]) # ecx = &s
lea(ECX, [hex]) # ecx = &s
args = [1, # fd = 1 (STDOUT)
ECX, # s = &s
11] # n = 11 (excluding term, max # of chars to print)