[NEW] First hints of cross-platform support. Compiles to Mach-O on Darwin with nasm and gcc.

There is no binary assembler support for Darwin yet! I'm not sure when I will dive into the details
of generating a Mach-O binary from Ruby or C.

[MERGED] Binary assembler support.  It *should* work on ELF but it needs testing on Linux.
This commit is contained in:
Sami Samhuri 2009-05-25 16:26:21 -07:00
parent 76d4d2be3a
commit a4506bab10
16 changed files with 437 additions and 407 deletions

View file

@ -6,4 +6,46 @@
# may 2009 # may 2009
module Assembler module Assembler
# Shared base class for the concrete code generators. It owns the state
# every generator needs (symbol table, label counters, label locations)
# and outlines the interface a subclass has to provide to be useful.
class AssemblerBase

  # Arguments are accepted and ignored, so a subclass can call a bare
  # `super` regardless of its own constructor signature.
  def initialize(*args)
    # Symbol table, maps names to locations in BSS.
    @vars = {}
    # Count of all labels handed out so far; keeps every label unique.
    @num_labels = 0
    # One counter per label suffix; unseen suffixes start at 0.
    @num_labels_with_suffix = Hash.new(0)
    # Maps label names to locations. Looking up an unknown label raises.
    @labels = Hash.new { |_table, missing| raise "undefined label: #{missing}" }
  end

  # Run the given block with this assembler as `self`, so instructions can
  # be written without an explicit receiver. Extra arguments are ignored.
  def block(*args, &blk)
    instance_eval(&blk)
  end

  # Produce the assembled result. Subclasses must override this; the base
  # implementation always raises.
  def output
    raise RuntimeError, "#{self.class} is supposed to implement this method!"
  end

  # Look up a variable in the symbol table; nil when it is not defined.
  def var(name)
    @vars[name]
  end
  alias var? var

  # Generate a unique label.
  #
  # Every label starts with "L" and a zero-padded running number. When a
  # suffix is given, "_<suffix>_<n>" is appended, where n counts how many
  # labels with that suffix have been generated so far.
  def label(suffix=nil)
    @num_labels += 1
    tail = suffix
    if suffix
      count = (@num_labels_with_suffix[suffix] += 1)
      tail = "_#{suffix}_#{count}"
    end
    "L#{format('%06d', @num_labels)}#{tail}"
  end
end
end end

View file

@ -5,13 +5,18 @@
# sjs # sjs
# may 2009 # may 2009
ROOT = __FILE__.sub(/\/asm\/binary\.rb$/, '') unless defined? ROOT
$LOAD_PATH << ROOT unless $LOAD_PATH.include?(ROOT)
require 'asm/asm'
module Assembler module Assembler
# Define a method named `emit_byte` and one named `binary_size` and # Define a method named `emit_byte` and one named `binary_size` and
# include this module. Calling the assembler methods will output # include this module. Calling the assembler methods will output
# x86 machine code ... hopefully. So far it's incomplete and # x86 machine code ... hopefully. So far it's incomplete and
# binaries just segfault. # binaries just segfault.
class Binary class Binary < AssemblerBase
# This structure allows for x86 registers of all sizes. The # This structure allows for x86 registers of all sizes. The
# number of the register is the index of the array in which it was # number of the register is the index of the array in which it was
@ -44,7 +49,72 @@ module Assembler
MaxUnsigned = 2**MachineBits - 1 MaxUnsigned = 2**MachineBits - 1
SignedRange = MinSigned..MaxSigned SignedRange = MinSigned..MaxSigned
# Machine code that ends the process using the value in eax as the exit
# status, keyed by platform name. `output` appends the entry for the
# current platform to the generated code.
#
# The bytes are packed with 'C*' (unsigned 8-bit): opcodes such as 0x89
# and 0xcd do not fit in a signed byte, which is what 'c*' describes.
#
# NOTE(review): BSD-style `int 0x80` system calls usually expect one extra
# dword on the stack above the arguments. Confirm that the Darwin sequence
# leaves the exit status where the kernel reads it.
X86_exit = {
  'linux' => [0x89, 0xc3,        # mov ebx, eax (exit code)
              0xb8, 1, 0, 0, 0,  # mov eax, 1
              0xcd, 0x80         # int 0x80
             ].pack('C*'),
  'darwin' => [0x50,             # push eax (exit code)
               0xb8, 1, 0, 0, 0, # mov eax, 1
               0xcd, 0x80        # int 0x80
              ].pack('C*')
}
# Create a binary assembler for the given platform and binary format,
# starting with an empty machine-code buffer, then compute the section
# layout for that combination (init_sections raises if it is unsupported).
def initialize(platform='linux', binformat='elf')
  super # bare super forwards platform and binformat to the parent class
  @platform, @binformat = platform, binformat
  @binary = [] # Byte array of machine code.
  init_sections
end
# Compute the header size and the offsets and sizes of the text, data and
# bss sections for the selected platform.
#
# Raises RuntimeError when the platform is unknown, or when the binary
# format is not the one accepted for that platform ('elf' on linux,
# 'macho' on darwin).
def init_sections
  case @platform
  when 'linux'
    raise "unsupported" unless @binformat == 'elf'
    @header_size = 0x100                      # ELF, Linux
    @text_offset = 0x08048000 + @header_size  # Offset of text section in memory
  when 'darwin'
    raise "unsupported" unless @binformat == 'macho'
    # NOTE(review): same numbers as the ELF branch; confirm they are
    # correct for Mach-O.
    @header_size = 0x100                      # Mach-O, Darwin
    @text_offset = 0x08048000 + @header_size  # Offset of text section in memory
  else
    # The platform lives in @platform; this method has no local named
    # `platform`, so interpolating that name raised NameError and hid the
    # intended message.
    raise "unsupported platform: #{@platform}"
  end
  @text_size = 0x02be00                     # Size of text section.
  @data_offset = @text_offset + @text_size  # Offset of data section.
  @data_size = 0x4e00                       # Size of data section.
  @bss_offset = @data_offset + @data_size   # Offset of bss section.
  @bss_size = 0                             # Size of bss section.
end
# Return the generated machine code as a byte string, followed by the
# exit sequence for the current platform.
def output
  machine_code = @binary.pack('c*')
  machine_code + X86_exit[@platform]
end
# Define a constant in the .data section.
#
# Not available in the binary assembler yet: every call raises
# RuntimeError, whatever the arguments.
def const(name, value)
  raise RuntimeError, "unimplemented!"
end
# Define a variable with the given name and size (in dwords).
#
# A new variable is recorded at the current end of the bss section, which
# then grows by `dwords`. Redefining an existing name changes nothing and
# prints a warning on standard error.
def defvar(name, dwords=1)
  if var?(name)
    STDERR.puts "[warning] attempted to redefine #{name}"
  else
    @vars[name] = @bss_size
    @bss_size += dwords
  end
end
# Generate a unique label (see the parent class) and record the number of
# bytes written so far as its location.
def label(suffix=nil)
  fresh = super
  @labels[fresh] = bytes_written
  fresh
end
# Count the bytes that were encoded in the given block. # Count the bytes that were encoded in the given block.
def asm def asm
@ -56,7 +126,18 @@ module Assembler
# return the number of bytes written # return the number of bytes written
bytes_written - instruction_offset bytes_written - instruction_offset
end end
# Append one byte of machine code to the output buffer.
def emit_byte(byte)
  @binary.push(byte)
end
# Number of machine-code bytes emitted so far.
def bytes_written
  @binary.length
end
# Bind a label to the current position in the machine code. When no name
# is passed, a fresh label is generated first.
def emit_label(name=label)
  position = @binary.size
  @labels[name] = position
end
def emit_dword(num) def emit_dword(num)
num_to_quad(num).each {|byte| emit_byte(byte)} num_to_quad(num).each {|byte| emit_byte(byte)}
@ -119,7 +200,7 @@ module Assembler
end end
def regnum(op) def regnum(op)
num = register? num = register?(op)
raise "not a register: #{op.inspect}" unless num raise "not a register: #{op.inspect}" unless num
num num
end end
@ -146,7 +227,7 @@ module Assembler
# 3. mov r/m32, reg32 (0x89, mod r/m, maybe sib) # 3. mov r/m32, reg32 (0x89, mod r/m, maybe sib)
# 3a. mov memoffset32, eax (0xa3, disp32) # 3a. mov memoffset32, eax (0xa3, disp32)
# 4. mov r/m32, immediate32 (0xc7, mod r/m, maybe sib, imm32) # 4. mov r/m32, immediate32 (0xc7, mod r/m, maybe sib, imm32)
def x86_mov(dest, src) def mov(dest, src)
dest = dest[6..-1] if dest.is_a?(String) && dest[0..5] == 'dword ' dest = dest[6..-1] if dest.is_a?(String) && dest[0..5] == 'dword '
src = src[6..-1] if src.is_a?(String) && src[0..5] == 'dword ' src = src[6..-1] if src.is_a?(String) && src[0..5] == 'dword '
@ -199,35 +280,35 @@ module Assembler
end end
def x86_add(dest, src) def add(dest, src)
end end
def x86_sub(dest, src) def sub(dest, src)
end end
def x86_imul(op) def imul(op)
end end
def x86_idiv(op) def idiv(op)
end end
def x86_inc(op) def inc(op)
asm do asm do
if register?(op) if register?(op)
emit_byte(0x40 + regnum(op)) emit_byte(0x40 + regnum(op))
elsif rm32?(op) elsif rm32?(op)
emit_byte(0xff) emit_byte(0xff)
emit_modrm(...) # emit_modrm(...)
else else
raise "unsupported op #{op}, wanted r32 or r/m32" raise "unsupported op #{op}, wanted r32 or r/m32"
end end
end end
end end
def x86_push(reg) def push(reg)
end end
def x86_cmp(a, b) def cmp(a, b)
end end

View file

@ -3,123 +3,166 @@
# sjs # sjs
# may 2009 # may 2009
ROOT = __FILE__.sub(/\/asm\/text\.rb$/, '') unless defined? ROOT
$LOAD_PATH << ROOT unless $LOAD_PATH.include?(ROOT)
require 'asm/asm'
module Assembler module Assembler
# Define a method named `emit` and include this module. Calling # Assembler methods output nasm-friendly x86 asm code, line by
# the assembler methods will output nasm-friendly x86 asm code, # line. This is dead easy and we can trust nasm to compile
# line by line. This is dead easy and we can trust nasm to # correct machine code, which isn't trivial.
# compile correct machine code, which is tricky. class Text < AssemblerBase
module Text
def self.included(other) def initialize(platform='linux')
im = other.instance_methods super
unless im.include?(:emit) @data = ''
raise "#{self.name} requires the including class define the emit method" @bss = ''
@code = ''
@templatefile = "#{ROOT}/template.#{platform}.asm"
raise "unsupported platform: #{platform}" unless File.readable?(@templatefile)
end
# Define a constant in the .data section.
#
# Appends one `name<TAB>equ value` line to the data section text. The
# line is newline-terminated so that consecutive definitions do not run
# together on a single line.
def const(name, value)
  @data << "#{name}\tequ #{value}\n"
end
# Define a variable with the given name and size (in dwords).
def defvar(name, dwords=1)
unless var?(name)
@bss << "#{name}: resd #{dwords}\n"
@vars[name] = name
else
STDERR.puts "[warning] attempted to redefine #{name}"
end end
end end
# Emit a line of code wrapped between a tab and a newline.
def x86_mov(dest, src) def emit(code, options={})
emit("mov #{dest}, #{src.is_a?(Numeric) ? "0x#{src.to_s(16)}" : src}") tab = options.has_key?(:tab) ? options[:tab] : "\t"
@code << "#{tab}#{code}\n"
end end
def x86_movzx(dest, src) def label(suffix=nil)
name = super
@labels[name] = name
return name
end
# Return the complete assembly source: the platform template with its
# data, bss and code placeholders replaced by the generated text.
#
# Only the first occurrence of each placeholder is replaced. The block
# form of `sub` is used so that the generated text is inserted verbatim;
# with a replacement string, backslash sequences in the text would be
# interpreted as back-references.
def output
  File.read(@templatefile).
    sub("{data}") { @data }.
    sub("{bss}") { @bss }.
    sub("{code}") { @code }
end
# Emit a label definition ("name:") with no leading tab. When no name is
# passed, a fresh label is generated first.
def emit_label(name=label)
  definition = "#{name}:"
  emit(definition, :tab => nil)
end
# Emit a mov instruction. A numeric source is written as given and is
# followed by an assembler comment showing the same value in hex.
def mov(dest, src)
  hex_note = src.is_a?(Numeric) ? " ; 0x#{src.to_s(16)}" : ''
  emit("mov #{dest}, #{src}#{hex_note}")
end
def movzx(dest, src)
emit("movzx #{dest}, #{src}") emit("movzx #{dest}, #{src}")
end end
def x86_add(dest, src) def add(dest, src)
emit("add #{dest}, #{src}") emit("add #{dest}, #{src}")
end end
def x86_sub(dest, src) def sub(dest, src)
emit("sub #{dest}, #{src}") emit("sub #{dest}, #{src}")
end end
def x86_imul(op) def imul(op)
emit("imul #{op}") emit("imul #{op}")
end end
def x86_idiv(op) def idiv(op)
emit("idiv #{op}") emit("idiv #{op}")
end end
def x86_inc(op) def inc(op)
emit("inc #{op}") emit("inc #{op}")
end end
def x86_dec(op) def dec(op)
emit("dec #{op}") emit("dec #{op}")
end end
def x86_push(reg) def push(reg)
emit("push #{reg}") emit("push #{reg}")
end end
def x86_pop(reg) def pop(reg)
emit("pop #{reg}") emit("pop #{reg}")
end end
def x86_call(label) def call(label)
emit("call #{label}") emit("call #{label}")
end end
def x86_neg(reg) def neg(reg)
emit("neg #{reg}") emit("neg #{reg}")
end end
def x86_not(rm32) def not(rm32)
emit("not #{rm32}") emit("not #{rm32}")
end end
def x86_xchg(op1, op2) def xchg(op1, op2)
emit("xchg #{op1}, #{op2}") emit("xchg #{op1}, #{op2}")
end end
def x86_and(op1, op2) def and_(op1, op2)
emit("and #{op1}, #{op2}") emit("and #{op1}, #{op2}")
end end
def x86_or(op1, op2) def or(op1, op2)
emit("or #{op1}, #{op2}") emit("or #{op1}, #{op2}")
end end
def x86_xor(op1, op2) def xor(op1, op2)
emit("xor #{op1}, #{op2}") emit("xor #{op1}, #{op2}")
end end
def x86_jz(label) def jz(label)
emit("jz #{label}") emit("jz #{label}")
end end
def x86_jnz(label) def jnz(label)
emit("jnz #{label}") emit("jnz #{label}")
end end
def x86_jmp(label) def jmp(label)
emit("jmp #{label}") emit("jmp #{label}")
end end
def x86_jl(label) def jl(label)
emit("jl #{label}") emit("jl #{label}")
end end
def x86_cmp(a, b) def cmp(a, b)
emit("cmp #{a}, #{b}") emit("cmp #{a}, #{b}")
end end
def x86_lea(a, b) def lea(a, b)
emit("lea #{a}, #{b}") emit("lea #{a}, #{b}")
end end
def x86_shr(a, b) def shr(a, b)
emit("shr #{a}, #{b}") emit("shr #{a}, #{b}")
end end
def x86_loop(label) def loop_(label)
emit("loop #{label}") emit("loop #{label}")
end end
def x86_int(num) def int(num)
emit("int 0x#{num.to_s(16)}") emit("int 0x#{num.to_s(16)}")
end end

View file

@ -1,14 +1,8 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
ROOT = __FILE__.sub(/\/build\.rb$/, '') unless defined? ROOT
require 'compiler' require 'compiler'
require 'asm/text'
require 'asm/binary'
X86_exit = [0x89, 0xc3, # mov ebx, eax (exit code)
0xb8, 1, 0, 0, 0, # mov eax, 1
0xcd, 0x80 # int 0x80
].pack('c*')
def main def main
filename = ARGV[0].to_s filename = ARGV[0].to_s
@ -23,39 +17,21 @@ def base(filename)
filename.sub(/\.[^.]*$/, '') filename.sub(/\.[^.]*$/, '')
end end
# Read a template file and fill in its placeholders.
#
# templatefile: path of the template to read
# data:         maps token names to replacement text; the first "{token}"
#               in the template is replaced for each entry
# returns:      the interpolated text
#
# The block form of `sub` inserts each replacement verbatim; a replacement
# string would have its backslash sequences interpreted as
# back-references. The block parameter also no longer reuses the name of
# an outer local.
def interpolate(templatefile, data)
  data.inject(File.read(templatefile)) do |text, (token, replacement)|
    text.sub("{#{token}}") { replacement }
  end
end
# filename: input filename # filename: input filename
# format: output format, nasm or binary # asm: assembler to use
# returns: output filename # returns: output filename
def compile(filename, format='asm') def compile(filename, asm, binformat='elf')
# compile to asm or binary
output = nil
File.open(filename, 'r') do |input| File.open(filename, 'r') do |input|
compiler = Compiler.new(input, format) compiler = Compiler.new(input, asm, binformat)
output = compiler.compile compiler.compile
end end
if format == 'asm'
mode = 'w' ext = asm.class.name.split('::').last[0,3].downcase == 'bin' ? 'bin' : 'asm'
data, bss, code = *output outfile = "#{base(filename)}.#{ext}"
output = interpolate("#{ROOT}/template.asm", File.open(outfile, 'wb') do |out|
:data => data, :bss => bss, :code => code) out.puts(asm.output)
else
mode = 'wb'
output += X86_exit
end
outfile = "#{base(filename)}.#{format}"
File.open(outfile, mode) do |out|
if format == 'asm'
out.puts(output)
end
end end
return outfile return outfile
@ -68,33 +44,47 @@ rescue ParseError => e
end end
# assemble using nasm, return resulting filename. # assemble using nasm, return resulting filename.
def asm(filename) def asm(filename, binformat='elf')
f = base(filename) f = base(filename)
outfile = "#{f}.o" outfile = "#{f}.o"
output = `nasm -f elf -g -o #{outfile} #{filename}` output = `nasm -f #{binformat} -g -o #{outfile} #{filename}`
if $?.exitstatus != 0 if $?.exitstatus != 0
raise "nasm failed: #{$?.exitstatus}", output puts output
raise "nasm failed: #{$?.exitstatus}"
end end
return outfile return outfile
end end
# link with ld, return resulting filename. # link with ld, return resulting filename.
def link(filename) def link(filename, platform='linux')
f = base(filename) f = base(filename)
output = `ld -o #{f} #{filename}` cmd, args = *case platform
when 'darwin': ['gcc', '-arch i386']
when 'linux': ['ld', '']
else
raise "unsupported platform: #{platform}"
end
output = `#{cmd} #{args} -o #{f} #{filename}`
if $?.exitstatus != 0 if $?.exitstatus != 0
raise "ld failed: #{$?.exitstatus}", output puts output
raise "ld failed: #{$?.exitstatus}"
end end
`chmod +x #{f}` `chmod +x #{f}`
return f return f
end end
def build(filename, format='asm') # TODO Use a dependency injection framework for the assembler, and
if format == 'asm' # other parts as things become more modular.
link( asm( compile(filename) ) ) def build(filename, platform='linux', format='asm', binformat='elf')
else # binary bin = if format == 'asm'
link( compile(filename, format='bin') ) code = compile(filename, Assembler::Text.new(platform))
end obj = asm( code, binformat )
link( obj, platform )
else # binary
obj = compile(filename, Assembler::Binary.new(platform), binformat)
link( obj, platform )
end
return bin
end end
def run(filename) def run(filename)

View file

@ -12,9 +12,6 @@
# require 'rubygems' # require 'rubygems'
# require 'unroller' # require 'unroller'
require 'asm'
require 'opcode'
class ParseError < StandardError class ParseError < StandardError
attr_reader :caller, :context attr_reader :caller, :context
def initialize(caller, context=nil) def initialize(caller, context=nil)
@ -24,43 +21,27 @@ class ParseError < StandardError
end end
class Compiler class Compiler
# This module uses our `emit_byte` method to output x86 machine code
# directly using the assembler library.
# include Assembler::Binary
Keywords = %w[ Keywords = %w[
if else end while until repeat for to do break if else end while until repeat for to do break
print print
] ]
attr_reader :data, :bss, :code attr_reader :asm
def initialize(input, asm=Assembler::Text.new) def initialize(input, asm, binformat='elf')
# XXX for development only! # XXX for development only!
@indent = 0 # for pretty printing @indent = 0 # for pretty printing
# The only binary format our assembler knows right now is ELF.
unless binformat == 'elf'
raise "Only ELF is supported. Unsupported binary format: #{binformat}."
end
@look = '' # Next lookahead char. @look = '' # Next lookahead char.
@token = nil # Type of last read token. @token = nil # Type of last read token.
@value = nil # Value of last read token. @value = nil # Value of last read token.
@input = input # Stream to read from. @input = input # Stream to read from.
@data = '' # Data section.
@bss = '' # BSS section.
@code = '' # Code section.
@binary = [] # Byte array of machine code.
@vars = {} # Symbol table, maps names to locations in BSS.
@num_labels = 0 # Used to generate unique labels.
@num_labels_with_suffix = Hash.new(0)
@header_size = 0x100 # ELF, Linux, x86
@text_offset = 0x08048000 + @header_size # Offset of text section in memory (Linux, x86).
@text_size = 0x02be00 # Size of text section.
@data_offset = @text_offset + @text_size # Offset of data section.
@data_size = 0x4e00 # Size of data section.
@bss_offset = @data_offset + @data_size # Offset of bss section.
@bss_size = 0 # Size of bss section.
# Labels for the assembler. Maps names to locations.
@labels = Hash.new {|h, key| raise "undefined label: #{key}"}
@asm = asm @asm = asm
@ -68,10 +49,6 @@ class Compiler
get_char get_char
end end
def asm
@asm
end
def compile def compile
block block
expected(:'end of file') unless eof? expected(:'end of file') unless eof?
@ -107,10 +84,10 @@ class Compiler
match('(') match('(')
# TODO arg list # TODO arg list
match(')') match(')')
x86_call(name) asm.call(name)
else else
# variable access # variable access
x86_mov(:eax, "dword [#{name}]") asm.mov(:eax, "dword [#{name}]")
end end
end end
@ -123,7 +100,7 @@ class Compiler
elsif alpha?(@look) elsif alpha?(@look)
identifier # or call identifier # or call
elsif digit?(@look) elsif digit?(@look)
x86_mov(:eax, get_number.to_i) asm.mov(:eax, get_number.to_i)
else else
expected(:'integer, identifier, function call, or parenthesized expression', :got => @look) expected(:'integer, identifier, function call, or parenthesized expression', :got => @look)
end end
@ -134,7 +111,7 @@ class Compiler
sign = @look sign = @look
match(sign) if op?(:unary, sign) match(sign) if op?(:unary, sign)
factor factor
x86_neg(:eax) if sign == '-' asm.neg(:eax) if sign == '-'
end end
# Parse and translate a single term (factor or mulop). Result is in # Parse and translate a single term (factor or mulop). Result is in
@ -172,7 +149,7 @@ class Compiler
def add def add
match('+') match('+')
term # Result is in eax. term # Result is in eax.
x86_add(:eax, '[esp]') # Add a to b. asm.add(:eax, '[esp]') # Add a to b.
end end
# Parse a subtraction operator and the 2nd term (b). The result is # Parse a subtraction operator and the 2nd term (b). The result is
@ -180,8 +157,8 @@ class Compiler
def subtract def subtract
match('-') match('-')
term # Result, b, is in eax. term # Result, b, is in eax.
x86_neg(:eax) # Fake the subtraction. a - b == a + -b asm.neg(:eax) # Fake the subtraction. a - b == a + -b
x86_add(:eax, '[esp]') # Add a and -b. asm.add(:eax, '[esp]') # Add a and -b.
end end
# Parse an addition operator and the 2nd term (b). The result is # Parse an addition operator and the 2nd term (b). The result is
@ -189,7 +166,7 @@ class Compiler
def multiply def multiply
match('*') match('*')
signed_factor # Result is in eax. signed_factor # Result is in eax.
x86_imul('dword [esp]') # Multiply a by b. asm.imul('dword [esp]') # Multiply a by b.
end end
# Parse a division operator and the divisor (b). The result is # Parse a division operator and the divisor (b). The result is
@ -197,14 +174,14 @@ class Compiler
def divide def divide
match('/') match('/')
signed_factor # Result is in eax. signed_factor # Result is in eax.
x86_xchg(:eax, '[esp]') # Swap the divisor and dividend into asm.xchg(:eax, '[esp]') # Swap the divisor and dividend into
# the correct places. # the correct places.
# idiv uses edx:eax as the dividend so we need to ensure that edx # idiv uses edx:eax as the dividend so we need to ensure that edx
# is correctly sign-extended w.r.t. eax. # is correctly sign-extended w.r.t. eax.
emit('cdq') # Sign-extend eax into edx (Convert Double to asm.cdq # Sign-extend eax into edx (Convert Double to
# Quad). # Quad).
x86_idiv('dword [esp]') # Divide a (eax) by b ([esp]). asm.idiv('dword [esp]') # Divide a (eax) by b ([esp]).
end end
@ -215,19 +192,19 @@ class Compiler
def bitor_expr def bitor_expr
match('|') match('|')
term term
x86_or(:eax, '[esp]') asm.or(:eax, '[esp]')
end end
def bitand_expr def bitand_expr
match('&') match('&')
signed_factor signed_factor
x86_and(:eax, '[esp]') asm.and_(:eax, '[esp]')
end end
def xor_expr def xor_expr
match('^') match('^')
term term
x86_xor(:eax, '[esp]') asm.xor(:eax, '[esp]')
end end
@ -240,6 +217,7 @@ class Compiler
while @look == '|' while @look == '|'
op '||' do op '||' do
boolean_term boolean_term
# !!! this method has moved, IMPLEMENT THIS!
emit("<logical or>") emit("<logical or>")
end end
end end
@ -250,6 +228,7 @@ class Compiler
while @look == '&' while @look == '&'
op '&&' do op '&&' do
not_factor not_factor
# !!! this method has moved, IMPLEMENT THIS!
emit("<logical and>") emit("<logical and>")
end end
end end
@ -258,9 +237,9 @@ class Compiler
def boolean_factor def boolean_factor
if boolean?(@look) if boolean?(@look)
if get_boolean == 'true' if get_boolean == 'true'
x86_mov(:eax, -1) asm.mov(:eax, -1)
else else
x86_xor(:eax, :eax) asm.xor(:eax, :eax)
end end
scan scan
else else
@ -273,7 +252,7 @@ class Compiler
match('!') match('!')
boolean_factor boolean_factor
make_boolean(:eax) # ensure it is -1 or 0... make_boolean(:eax) # ensure it is -1 or 0...
x86_not(:eax) # so that not is also boolean not asm.not(:eax) # so that not is also boolean not
else else
boolean_factor boolean_factor
end end
@ -282,11 +261,11 @@ class Compiler
# Convert any identifier to a boolean (-1 or 0). This is # Convert any identifier to a boolean (-1 or 0). This is
# semantically equivalent to !!reg in C or Ruby. # semantically equivalent to !!reg in C or Ruby.
def make_boolean(reg=:eax) def make_boolean(reg=:eax)
end_label = unique_label(:endmakebool) end_label = asm.label(:endmakebool)
x86_cmp(reg, 0) # if false do nothing asm.cmp(reg, 0) # if false do nothing
x86_jz(end_label) asm.jz(end_label)
x86_mov(reg, -1) # truthy, make it true asm.mov(reg, -1) # truthy, make it true
emit_label(end_label) asm.emit_label(end_label)
end end
def relation def relation
@ -314,14 +293,14 @@ class Compiler
# and make_boolean will leave -1 (true) for us in eax. # and make_boolean will leave -1 (true) for us in eax.
def neq_relation def neq_relation
expression expression
x86_sub(:eax, '[esp]') asm.sub(:eax, '[esp]')
make_boolean make_boolean
end end
# Invert the != test for equal. # Invert the != test for equal.
def eq_relation def eq_relation
neq_relation neq_relation
x86_not(:eax) asm.not(:eax)
end end
# > and < are both implemented in terms of jl (jump if less than). # > and < are both implemented in terms of jl (jump if less than).
@ -337,20 +316,20 @@ class Compiler
# Invert the sense of the test? # Invert the sense of the test?
invert = options[:invert] invert = options[:invert]
true_label = unique_label(:cmp) true_label = asm.label(:cmp)
end_label = unique_label(:endcmp) end_label = asm.label(:endcmp)
x86_cmp(a, b) asm.cmp(a, b)
x86_jl(true_label) asm.jl(true_label)
x86_xor(:eax, :eax) # return false asm.xor(:eax, :eax) # return false
x86_not(:eax) if invert # (or true if inverted) asm.not(:eax) if invert # (or true if inverted)
x86_jmp(end_label) asm.jmp(end_label)
emit_label(true_label) asm.emit_label(true_label)
x86_xor(:eax, :eax) # return true asm.xor(:eax, :eax) # return true
x86_not(:eax) unless invert # (or false if inverted) asm.not(:eax) unless invert # (or false if inverted)
emit_label(end_label) asm.emit_label(end_label)
end end
# a: [esp] # a: [esp]
@ -401,8 +380,8 @@ class Compiler
name = @value name = @value
match('=') match('=')
boolean_expression boolean_expression
defvar(name) unless var?(name) asm.defvar(name) unless asm.var?(name)
x86_mov("dword [#{name}]", :eax) asm.mov("dword [#{name}]", :eax)
end end
# Parse a code block. # Parse a code block.
@ -439,26 +418,26 @@ class Compiler
# Parse an if-else statement. # Parse an if-else statement.
def if_else_stmt(label) def if_else_stmt(label)
else_label = unique_label(:end_or_else) else_label = asm.label(:end_or_else)
end_label = else_label # only generated if else clause end_label = else_label # only generated if else clause
# present # present
condition condition
skip_any_whitespace skip_any_whitespace
x86_jz(else_label) asm.jz(else_label)
@indent += 1 @indent += 1
block(label) block(label)
@indent -= 1 @indent -= 1
if @token == :keyword && @value == 'else' if @token == :keyword && @value == 'else'
skip_any_whitespace skip_any_whitespace
end_label = unique_label(:endif) # now we need the 2nd label end_label = asm.label(:endif) # now we need the 2nd label
x86_jmp(end_label) asm.jmp(end_label)
emit_label(else_label) asm.emit_label(else_label)
@indent += 1 @indent += 1
block(label) block(label)
@indent -= 1 @indent -= 1
end end
match_word('end') match_word('end')
emit_label(end_label) asm.emit_label(end_label)
end end
# Used to implement the Two-Label-Loops (while, until, repeat). # Used to implement the Two-Label-Loops (while, until, repeat).
@ -467,9 +446,9 @@ class Compiler
# block: Code to execute at the start of each iteration. (e.g. a # block: Code to execute at the start of each iteration. (e.g. a
# condition) # condition)
def simple_loop(name) def simple_loop(name)
start_label = unique_label(:"loop_#{name}") start_label = asm.label(:"loop_#{name}")
end_label = unique_label(:"end_#{name}") end_label = asm.label(:"end_#{name}")
emit_label(start_label) asm.emit_label(start_label)
yield(end_label) yield(end_label)
@ -477,15 +456,15 @@ class Compiler
block(end_label) block(end_label)
@indent -= 1 @indent -= 1
match_word('end') match_word('end')
x86_jmp(start_label) asm.jmp(start_label)
emit_label(end_label) asm.emit_label(end_label)
end end
def while_stmt def while_stmt
simple_loop('while') do |end_label| simple_loop('while') do |end_label|
condition condition
skip_any_whitespace skip_any_whitespace
x86_jz(end_label) asm.jz(end_label)
end end
end end
@ -493,7 +472,7 @@ class Compiler
simple_loop('until') do |end_label| simple_loop('until') do |end_label|
condition condition
skip_any_whitespace skip_any_whitespace
x86_jnz(end_label) asm.jnz(end_label)
end end
end end
@ -511,24 +490,24 @@ class Compiler
counter = "[#{get_name}]" counter = "[#{get_name}]"
match('=') match('=')
boolean_expression # initial value boolean_expression # initial value
x86_sub(:eax, 1) # pre-decrement because of the asm.sub(:eax, 1) # pre-decrement because of the
# following pre-increment # following pre-increment
x86_mov(counter, :eax) # stash the counter in memory asm.mov(counter, :eax) # stash the counter in memory
match_word('to', :scan => true) match_word('to', :scan => true)
boolean_expression # final value boolean_expression # final value
skip_any_whitespace skip_any_whitespace
x86_push(:eax) # stash final value on stack asm.push(:eax) # stash final value on stack
final = '[esp]' final = '[esp]'
simple_loop('for') do |end_label| simple_loop('for') do |end_label|
x86_mov(:ecx, counter) # get the counter asm.mov(:ecx, counter) # get the counter
x86_add(:ecx, 1) # increment asm.add(:ecx, 1) # increment
x86_mov(counter, :ecx) # store the counter asm.mov(counter, :ecx) # store the counter
x86_cmp(final, :ecx) # check if we're done asm.cmp(final, :ecx) # check if we're done
x86_jz(end_label) # if so jump to the end asm.jz(end_label) # if so jump to the end
end end
x86_add(:esp, 4) # clean up the stack asm.add(:esp, 4) # clean up the stack
end end
# do 5 # do 5
@ -538,39 +517,38 @@ class Compiler
boolean_expression boolean_expression
skip_any_whitespace skip_any_whitespace
x86_mov(:ecx, :eax) asm.mov(:ecx, :eax)
x86_push(:ecx)
start_label = unique_label(:do) start_label = asm.label(:do)
end_label = unique_label(:enddo) end_label = asm.label(:enddo)
emit_label(start_label) asm.emit_label(start_label)
x86_push(:ecx) asm.push(:ecx)
@indent += 1 @indent += 1
block(end_label) block(end_label)
@indent -= 1 @indent -= 1
x86_pop(:ecx) asm.pop(:ecx)
match_word('end') match_word('end')
x86_loop(start_label) asm.loop_(start_label)
# Phony push! break needs to clean up the stack, but since we # Phony push! break needs to clean up the stack, but since we
# don't know if there is a break at this point we fake a push and # don't know if there is a break at this point we fake a push and
# always clean up the stack after. # always clean up the stack after.
x86_sub(:esp, 4) asm.sub(:esp, 4)
emit_label(end_label) asm.emit_label(end_label)
# If there was a break we have to clean up the stack here. If # If there was a break we have to clean up the stack here. If
# there was no break we clean up the phony push above. # there was no break we clean up the phony push above.
x86_add(:esp, 4) asm.add(:esp, 4)
end end
def break_stmt(label) def break_stmt(label)
if label if label
x86_jmp(label) asm.jmp(label)
else else
expected(:'break to be somewhere useful', expected(:'break to be somewhere useful',
:got => :'a break outside a loop') :got => :'a break outside a loop')
@ -581,51 +559,57 @@ class Compiler
def condition def condition
boolean_expression boolean_expression
skip_whitespace skip_whitespace
x86_cmp(:eax, 0) # 0 is false, anything else is true asm.cmp(:eax, 0) # 0 is false, anything else is true
end end
# print eax in hex format # print eax in hex format
def print_stmt def print_stmt
# define a lookup table of digits asm.block do
unless var?('DIGITS') # define a lookup table of digits
defvar('DIGITS', 4) unless var?('DIGITS')
x86_mov('dword [DIGITS]', 0x33323130) defvar('DIGITS', 4)
x86_mov('dword [DIGITS+4]', 0x37363534) mov('dword [DIGITS]', 0x33323130)
x86_mov('dword [DIGITS+8]', 0x62613938) mov('dword [DIGITS+4]', 0x37363534)
x86_mov('dword [DIGITS+12]', 0x66656463) mov('dword [DIGITS+8]', 0x62613938)
mov('dword [DIGITS+12]', 0x66656463)
end
# 3 dwords == 12 chars
defvar('HEX', 3) unless var?('HEX')
# TODO check sign and prepend '-' if negative
mov('word [HEX]', 0x7830) # "0x" == [48, 120]
mov('word [HEX+10]', 0xa) # newline + null terminator
end end
# 3 dwords == 12 chars
defvar('HEX', 3) unless var?('HEX')
# TODO check sign and prepend '-' if negative
x86_mov('word [HEX]', 0x7830) # "0x" == [48, 120]
x86_mov('word [HEX+10]', 0xa) # newline + null terminator
boolean_expression boolean_expression
# convert eax to a hex string asm.block do
x86_lea(:esi, '[DIGITS]') # convert eax to a hex string
x86_lea(:edi, '[HEX+9]') lea(:esi, '[DIGITS]')
# build the string backwards (right to left), byte by byte lea(:edi, '[HEX+9]')
x86_mov(:ecx, 4) # build the string backwards (right to left), byte by byte
emit_label(loop_label=unique_label) mov(:ecx, 4)
# low nybble of nth byte end
x86_movzx(:ebx, :al) asm.emit_label(loop_label=asm.label)
x86_and(:bl, 0x0f) # isolate low nybble asm.block do
x86_movzx(:edx, 'byte [esi+ebx]') # low nybble of nth byte
x86_mov('byte [edi]', :dl) movzx(:ebx, :al)
x86_dec(:edi) and_(:bl, 0x0f) # isolate low nybble
# high nybble of nth byte movzx(:edx, 'byte [esi+ebx]')
x86_movzx(:ebx, :al) mov('byte [edi]', :dl)
x86_and(:bl, 0xf0) # isolate high nybble dec(:edi)
x86_shr(:bl, 4) # high nybble of nth byte
x86_mov(:dl, 'byte [esi+ebx]') movzx(:ebx, :al)
x86_mov('byte [edi]', :dl) and_(:bl, 0xf0) # isolate high nybble
x86_dec(:edi) shr(:bl, 4)
x86_shr(:eax, 8) mov(:dl, 'byte [esi+ebx]')
x86_loop(loop_label) mov('byte [edi]', :dl)
x86_mov(:eax, 4) # SYS_write dec(:edi)
x86_mov(:ebx, 1) # STDOUT shr(:eax, 8)
x86_lea(:ecx, '[HEX]') loop_(loop_label)
x86_mov(:edx, 11) # excluding term, max # of chars to print mov(:eax, 4) # SYS_write
x86_int(0x80) mov(:ebx, 1) # STDOUT
lea(:ecx, '[HEX]')
mov(:edx, 11) # excluding term, max # of chars to print
int(0x80)
end
end end
@ -802,67 +786,7 @@ class Compiler
get_char while any_whitespace?(@look) get_char while any_whitespace?(@look)
end end
# Define a constant in the .data section.
def equ(name, value)
@data << "#{name}\tequ #{value}"
end
# Define a variable with the given name and size (in dwords).
def defvar(name, dwords=1)
unless var?(name)
@bss << "#{name}: resd #{dwords}\n"
@vars[name] = @bss_size
@bss_size += dwords
else
STDERR.puts "[warning] attempted to redefine #{name}"
end
end
def var?(name)
@vars[name]
end
def var(name)
@vars[name]
end
# Emit a line of code wrapped between a tab and a newline. Required
# by Assembler::Text.
def emit(code, options={})
tab = options.has_key?(:tab) ? options[:tab] : "\t"
@code << "#{tab}#{code}\n"
end
# emit_byte and bytes_written are required by Assembler::Binary.
def emit_byte(byte)
@binary << byte
end
def bytes_written
@binary.size
end
def emit_label(name=unique_label)
emit("#{name}:", :tab => nil)
@labels[name] = @binary.length
end
def resolve_label(label)
@labels[label]
end
# Generate a unique label of the form "L000042", where the number is a
# running counter zero-padded to six digits.
#
# suffix - optional tag; when given, "_<suffix>_<k>" is appended, where k
#          counts how many labels have been requested with that suffix.
def unique_label(suffix=nil)
  @num_labels += 1
  tail = suffix
  if suffix
    count = (@num_labels_with_suffix[suffix] += 1)
    tail = "_#{suffix}_#{count}"
  end
  format('L%06d%s', @num_labels, tail)
end
def indent def indent
real_indent = if @value == 'else' || @value == 'end' real_indent = if @value == 'else' || @value == 'end'
@indent - 1 @indent - 1
@ -872,16 +796,10 @@ class Compiler
' ' * (real_indent * 4) ' ' * (real_indent * 4)
end end
# Pack the @binary array into a byte string. The 'c*' directive packs
# every element as one 8-bit char.
def binary
@binary.pack('c*')
end
def pushing(reg) def pushing(reg)
x86_push(reg) asm.push(reg)
yield yield
x86_add(:esp, 4) asm.add(:esp, 4)
end end
def op(name) def op(name)

View file

@ -1,3 +0,0 @@
mov ebx, eax    ; ebx = eax
mov eax, 1      ; NOTE(review): eax = 1 followed by int 0x80 looks like the Linux exit syscall -- confirm
int 0x80

6
min.darwin.asm Normal file
View file

@ -0,0 +1,6 @@
;; Minimal 32-bit program for Darwin: exports _main, which returns 0 in eax.
BITS 32
SECTION .text
GLOBAL _main
_main:
mov eax, 0      ; return value
ret

View file

@ -1,4 +0,0 @@
BITS 32
mov ebx,eax     ; ebx = eax
mov eax,1       ; NOTE(review): eax = 1 followed by int 0x80 looks like the Linux exit syscall -- confirm
int 0x80

View file

@ -1,25 +0,0 @@
# The pieces of one encoded instruction: prefix, opcode, ModRM, SIB and any
# extra bytes. Each field may be a single byte, an array of bytes, or nil
# when the instruction does not use it.
class OpCode
  # Field names, in the order their bytes are emitted.
  Attrs = [:prefix, :op, :modrm, :sib, :extra].freeze
  attr_accessor(*Attrs)

  # attrs - Hash keyed by the symbols in Attrs. Keys that are missing leave
  #         the corresponding field nil.
  def initialize(attrs)
    Attrs.each { |attr| send("#{attr}=", attrs[attr]) }
  end

  # Number of bytes #binary produces. Unset (nil) fields count as zero
  # bytes, so this always agrees with binary's length.
  def size
    bytes.size
  end

  # The instruction packed into a byte string, one 8-bit char per byte.
  def binary
    bytes.pack('c*')
  end

  private

  # Flat list of this instruction's bytes in field order. Nested arrays are
  # flattened and unset fields are dropped, so #size and #binary share one
  # definition of what gets emitted.
  def bytes
    Attrs.map { |attr| send(attr) }.flatten.compact
  end
end

View file

@ -1,4 +0,0 @@
;; Prologue: 32-bit code, exported _start label at the top of .text.
BITS 32
GLOBAL _start
SECTION .text
_start:

11
template.darwin.asm Normal file
View file

@ -0,0 +1,11 @@
;; Program template for Darwin. The brace-delimited placeholders in the
;; three sections are presumably replaced with generated text by the build
;; scripts before assembling -- TODO confirm against the caller.
BITS 32
GLOBAL _main
SECTION .data
{data}
SECTION .bss
{bss}
SECTION .text
_main:
{code}
;; The result in eax is the exit code, just return.
ret

47
test.rb
View file

@ -1,47 +0,0 @@
require 'compiler'
require 'stringio'
# Machine code appended after the compiled program so it exits with the
# value left in eax.
X86_exit = [
  0x89, 0xc3,                    # mov ebx, eax  (exit code)
  0xb8, 0x01, 0x00, 0x00, 0x00,  # mov eax, 1
  0xcd, 0x80                     # int 0x80
].pack('c*')
# Write +msg+ to standard error.
def error(msg)
  STDERR.puts(msg)
end
# Compile +input+ and return the compiler's result, a tuple of
# [data, bss, code, binary].
#
# On a ParseError the message, context and caller are reported through
# #error and the process exits with status 1.
# NOTE(review): ParseError#context and #caller are defined outside this
# file -- confirm they exist.
def parse(input)
  Compiler.new(input).parse
rescue ParseError => failure
  error("[error] #{failure.message}")
  error("[context] #{failure.context}")
  error(failure.caller)
  exit(1)
end
# Fill in a template.
#
# template - String containing "{token}" placeholders.
# data     - Hash mapping token names to replacement strings.
#
# For each mapping, the first occurrence of "{token}" is replaced. The
# replacement is inserted literally: the block form of String#sub is used
# because a string replacement argument would have its backslash sequences
# (\0, \1, \\) interpreted, which mangles generated assembly text.
#
# Returns the interpolated String; +template+ itself is not modified.
def interpolate(template, data)
  data.inject(template) do |text, (token, replacement)|
    # to_str keeps the strictness of the two-argument sub: anything that
    # is not string-like still fails loudly instead of being stringified.
    text.sub("{#{token}}") { replacement.to_str }
  end
end
# Compile one program and write the results.
#
# arg - path of the source file. When it is not a readable file, a small
#       built-in sample program is compiled instead.
#
# Writes the interpolated assembly to test.asm and the raw machine code,
# followed by the X86_exit sequence, to test.bin.
def main(arg)
  input = if File.readable?(arg)
            File.open(arg)
          else
            # StringIO.new("5*(3-5)*2+2-9/3-8/2-4*(5+5+5)\n")
            StringIO.new("abc=999\nabc-888\n")
          end
  begin
    data, bss, code, binary = *parse(input)
  ensure
    # Release the source handle even when parse exits on a ParseError.
    input.close
  end
  template = File.read("template.asm")
  asm = interpolate(template, :data => data, :bss => bss, :code => code)
  File.open("test.asm", "w") { |f| f.puts(asm) }
  File.open("test.bin", "wb") { |f|
    f.write(binary)
    f.write(X86_exit)
  }
end
main(ARGV[0].to_s)

View file

@ -1,50 +1,63 @@
# Pick the object format from the host platform. The value is passed to
# test.rb as its second argument.
PLATFORM := $(shell uname -s)

# Values are unquoted on purpose: make keeps quotes as part of the value,
# so a quoted "bin" would never compare equal to bin in the check below.
BINFORMAT := bin

ifeq ($(PLATFORM),Darwin)
BINFORMAT := macho
endif

ifeq ($(PLATFORM),Linux)
BINFORMAT := elf
endif

# $(warning) rather than echo: recipe lines are not allowed outside a rule.
ifeq ($(BINFORMAT),bin)
$(warning binary format is 'bin', this is probably not what you want!)
$(warning Your platform, $(PLATFORM), is unsupported.)
endif
all: lt gt ge le eq neq if while until repeat for do break print all: lt gt ge le eq neq if while until repeat for do break print
@echo -n @true
lt: test.rb test_lt.code lt: test.rb test_lt.code
@./test.rb lt @./test.rb lt $(BINFORMAT)
gt: test.rb test_gt.code gt: test.rb test_gt.code
@./test.rb gt @./test.rb gt $(BINFORMAT)
ge: test.rb test_ge.code ge: test.rb test_ge.code
@./test.rb ge @./test.rb ge $(BINFORMAT)
le: test.rb test_le.code le: test.rb test_le.code
@./test.rb le @./test.rb le $(BINFORMAT)
eq: test.rb test_eq.code eq: test.rb test_eq.code
@./test.rb eq @./test.rb eq $(BINFORMAT)
neq: test.rb test_neq.code neq: test.rb test_neq.code
@./test.rb neq @./test.rb neq $(BINFORMAT)
if: test.rb test_if.code if: test.rb test_if.code
@./test.rb if @./test.rb if $(BINFORMAT)
while: test.rb test_while.code while: test.rb test_while.code
@./test.rb while @./test.rb while $(BINFORMAT)
until: test.rb test_until.code until: test.rb test_until.code
@./test.rb until @./test.rb until $(BINFORMAT)
repeat: test.rb test_repeat.code repeat: test.rb test_repeat.code
@./test.rb repeat @./test.rb repeat $(BINFORMAT)
for: test.rb test_for.code for: test.rb test_for.code
@./test.rb for @./test.rb for $(BINFORMAT)
do: test.rb test_do.code do: test.rb test_do.code
@./test.rb do @./test.rb do $(BINFORMAT)
break: test.rb test_break.code break: test.rb test_break.code
@./test.rb break @./test.rb break $(BINFORMAT)
print: test.rb test_print.code print: test.rb test_print.code
@./test.rb print @./test.rb print $(BINFORMAT)
big_test: test.rb big_test.code big_test: test.rb big_test.code
@./test.rb big @./test.rb big $(BINFORMAT)
clean: clean:
@rm -f test*.asm test*.o @rm -f test*.asm test*.o

View file

@ -5,12 +5,21 @@ $LOAD_PATH << ROOT
require 'build' require 'build'
# usage: build.rb <func> [binformat]
#
# ([format] will go before [binformat])
def main def main
func = ARGV[0].to_s func = ARGV[0].to_s
format = 'asm' # 'bin' only assembles one or two
# instructions right now, but support
# is in place
binformat = (ARGV[1] ? ARGV[1] : 'elf').downcase
platform = `uname -s`.chomp.downcase
print "testing #{func} ... " print "testing #{func} ... "
success = run( build("test_#{func}.code") ) success = run( build("test_#{func}.code", platform, format, binformat) )
puts success == 0 ? "pass" : "FAIL! (#{success})" puts success == 0 ? "pass" : "FAIL! (#{success})"
exit(success) exit(success.to_i)
end end
main if $0 == __FILE__ main if $0 == __FILE__