WIP: re-organize into lib/ dir

This commit is contained in:
Sami Samhuri 2026-06-18 06:42:12 -07:00
parent 5da06f938c
commit a12bdafde4
No known key found for this signature in database
34 changed files with 4280 additions and 0 deletions

7
bin/compile Executable file
View file

@ -0,0 +1,7 @@
#!/usr/bin/env ruby
# Resolve lib/ relative to this script rather than the caller's working
# directory, so the compiler can be launched from anywhere.
$LOAD_PATH.unshift(File.expand_path('../lib', File.dirname(__FILE__)))
require 'compiler'
???

116
lib/compiler.rb Normal file
View file

@ -0,0 +1,116 @@
# Directory containing this file (lib/).
this_dir = File.dirname(__FILE__)
# Switch the process working directory to the parent of lib/.
# NOTE(review): this runs as a side effect of requiring the file, so relative
# paths supplied by the caller resolve against the new directory afterwards --
# confirm that is intended.
Dir.chdir(File.expand_path('..', this_dir))
# Make files under lib/ requirable without adding a duplicate entry.
$LOAD_PATH.unshift(this_dir) unless $LOAD_PATH.include?(this_dir)
require 'compiler/parser'
# Front end that picks an architecture, an assembler and (for binary output)
# an object-file format, then drives the parser over some input.
#
# The compiler hands itself to the assembler as its delegate, so assemblers
# read #arch, #platform and the two factories back from this object.
class Compiler

  attr_reader :platform, :arch_name, :format, :binformat
  attr_reader :arch, :asm, :symbol_table_factory, :object_file_factory

  # platform  [String] "linux" or "darwin"
  # arch_name [String] "x86" or "arm"
  # format    [String] "text" or "bin"
  # binformat [String, nil] "elf" or "macho", only used when format is "bin"
  #
  # Raises RuntimeError when the binary format, architecture or output format
  # is not supported.
  def initialize(platform, arch_name, format, binformat = nil)
    @platform = platform
    @arch_name = arch_name
    @format = format
    @binformat = binformat
    wire
  end

  # Parse and compile the given input with the wired assembler. Returns
  # whatever Parser#compile returns.
  def compile(input)
    parser = Parser.new(input, asm)
    parser.parse
    parser.compile
  end

  #######
  private
  #######

  # Load and connect the components selected by the constructor arguments.
  # The object-file format is wired first because binary assemblers ask the
  # delegate for the symbol table factory while they are being constructed.
  def wire
    if format == 'bin'
      case binformat
      when 'elf'
        wire_elf
      when 'macho'
        wire_macho
      else
        raise "unsupported binary format: #{binformat}"
      end
    end

    case arch_name
    when 'x86'
      wire_x86
    when 'arm'
      wire_arm
    else
      # Report the requested name; @arch has not been assigned at this point.
      raise "unsupported arch: #{arch_name}"
    end
  end

  # Requires are done lazily so only the selected back end gets loaded.
  def wire_elf
    require 'compiler/asm/elf/object_file'
    require 'compiler/asm/elf/symbol_table'
    @symbol_table_factory = ASM::ELF::SymbolTable
    @object_file_factory = ASM::ELF::ObjectFile
  end

  def wire_macho
    require 'compiler/asm/macho/object_file'
    require 'compiler/asm/macho/symbol_table'
    @symbol_table_factory = ASM::MachO::SymbolTable
    @object_file_factory = ASM::MachO::ObjectFile
  end

  def wire_arm
    require 'compiler/asm/arm/binary_assembler'
    require 'compiler/asm/arm/text_assembler'
    @arch = ASM::ARM::Arch.instance
    @asm =
      case format
      when 'text'
        ASM::ARM::TextAssembler.new(self)
      when 'bin'
        ASM::ARM::BinaryAssembler.new(self)
      else
        raise "unsupported output format: #{format}"
      end
  end

  def wire_x86
    require 'compiler/asm/x86/binary_assembler'
    require 'compiler/asm/x86/text_assembler'
    @arch = ASM::X86::Arch.instance
    @asm =
      case format
      when 'text'
        ASM::X86::TextAssembler.new(self)
      when 'bin'
        ASM::X86::BinaryAssembler.new(self)
      else
        raise "unsupported output format: #{format}"
      end
  end

end

67
lib/compiler/asm/arch.rb Normal file
View file

@ -0,0 +1,67 @@
class Compiler
  module ASM

    # Value object describing a target architecture: register width, word
    # width, byte order, and the binary preamble/postamble.
    class Arch

      attr_reader :bits, :word_bits, :endianness, :preamble, :postamble

      # config is indexed with these string keys:
      #  - bits:       native register / pointer size
      #  - word_bits:  number of bits in a word
      #  - endianness: "big" or "little"
      #  - preamble:   binary preamble
      #  - postamble:  binary postamble
      def initialize(config)
        %w[bits word_bits endianness preamble postamble].each do |key|
          instance_variable_set("@#{key}", config[key])
        end
      end

      # Native register size in bytes.
      def bytes
        bits / 8
      end

      # Word size in bytes.
      def word_bytes
        word_bits / 8
      end

      # Pointers are as wide as native registers.
      def pointer_bytes
        bytes
      end

      def big_endian?
        endianness == 'big'
      end

      def little_endian?
        endianness == 'little'
      end

      # Smallest value of a signed integer that is `bits` wide.
      def min_signed
        -(2**(bits - 1))
      end

      # Largest value of a signed integer that is `bits` wide.
      def max_signed
        (2**(bits - 1)) - 1
      end

      def min_unsigned
        0
      end

      # Largest value of an unsigned integer that is `bits` wide.
      def max_unsigned
        (2**bits) - 1
      end

      # Range of representable signed integers, computed once and cached.
      def signed_int
        @signed_int ||= Range.new(min_signed, max_signed)
      end

    end

  end
end

View file

@ -0,0 +1,82 @@
# sjs
# may 2009
class Compiler
  module ASM

    # Abstract class for common functionality between different code
    # generators. Also defines somewhat of an interface that must be
    # implemented to be useful: every instruction method below is an empty
    # stub that concrete assemblers are expected to override.
    class Assembler

      # The object this assembler was constructed with. #arch is read from
      # it, and subclasses consult it for further settings.
      attr_reader :delegate

      def initialize(delegate)
        @delegate = delegate
      end

      # Architecture description supplied by the delegate.
      def arch
        delegate.arch
      end

      # Evaluate the given block with this assembler as self, so instruction
      # methods can be called without an explicit receiver. Positional
      # arguments are accepted and ignored.
      def block(*args, &block)
        instance_eval(&block)
      end

      def load(n)
      end

      def load_var(name)
      end

      def store_var(name, reg)
      end

      def neg(reg)
      end

      def stack_add(reg)
      end

      def stack_sub(reg)
      end

      def stack_mul_signed(reg)
      end

      def stack_div(reg)
      end

      def stack_or(reg)
      end

      def stack_xor(reg)
      end

      def stack_and(reg)
      end

      # Named with a trailing underscore because `not` is a Ruby keyword; the
      # alias below still lets callers use `asm.not(reg)` with a receiver.
      def not_(reg)
      end
      alias_method :not, :not_

      def compare(reg, n)
      end

      def je(label)
      end

      def jne(label)
      end

      def jmp(label)
      end

      def mov_reg_imm(reg, n)
      end

      def call(label)
      end

    end

  end
end

View file

@ -0,0 +1,322 @@
require 'compiler/asm/assembler'
require 'compiler/asm/constant_proxy'
require 'compiler/asm/variable_proxy'
class Compiler
  module ASM

    # Assembler that emits machine code bytes and hands them to an
    # object-file writer.
    #
    # Output is produced in two passes: bytes and address placeholders are
    # collected in @ir while instructions are emitted, labels are resolved to
    # relative addresses by #resolve_labels, and variable/constant addresses
    # are patched in by #output once the symbol table knows its offsets.
    class BinaryAssembler < Assembler

      DEBUG_OUTPUT = false

      # Number of bytes emitted so far.
      attr_reader :ip

      # delegate is asked for #symbol_table_factory here and for #platform
      # and #object_file_factory later on.
      def initialize(delegate)
        super(delegate)
        @symtab = delegate.symbol_table_factory.new

        # Almost a byte array, except for addresses.
        #
        # Addresses take the form [:<type>, <name>]
        # where <type> is one of: var, const, or label
        #
        # NOTE the type is redundant because of VariableProxy#const?
        #      and labels are just strings.
        #
        #      however, we could accept strings for variable names
        #      if we keep the type tag.  something to think about.
        @ir = []

        # Our instruction pointer, or the number of bytes written.
        @ip = 0

        # Map locations in the byte array to var proxies so we can
        # resolve address operations on the 2nd pass.
        @proxies = {}

        emit_entry_point
        emit_preamble
      end

      # register for return values
      def return_reg
        raise 'subclasses must override #return_reg'
      end

      # Hook for subclasses; emits nothing by default.
      def emit_entry_point
      end

      # @delegate is read directly (it is assigned by Assembler#initialize)
      # so these methods do not depend on an accessor being defined.
      def emit_preamble
        arch.preamble[@delegate.platform].each { |byte| emit_byte(byte) }
      end

      def emit_postamble
        arch.postamble[@delegate.platform].each { |byte| emit_byte(byte) }
      end

      # Finish the program and return the serialized object file.
      def output
        emit_postamble

        byte_array = resolve_labels

        #puts "1st pass: " + byte_array.inspect if DEBUG_OUTPUT

        binary = package(byte_array)
        @symtab.calculate_offsets(binary.length)
        if DEBUG_OUTPUT
          puts ">>> text offset:  0x#{@symtab.text_offset.to_s(16)}"
          puts ">>> const offset: 0x#{@symtab.const_offset.to_s(16)}"
          puts ">>> bss offset:   0x#{@symtab.bss_offset.to_s(16)}"
        end

        # Now that we know where everything lies do the 2nd pass
        # calculating and filling in final var and const addresses.
        #
        # outline:
        #   - resolve all variable proxies in @proxies replacing
        #     the placeholder bytes (0xff) with the real address
        bss_offset = @symtab.bss_offset
        const_offset = @symtab.const_offset
        @proxies.each do |i, proxy|
          #puts ">>> Resolving #{proxy.name}" if DEBUG_OUTPUT
          base_addr = if proxy.const?
                        const_offset + @symtab.const(proxy.name)
                      else
                        bss_offset + @symtab.var(proxy.name)
                      end
          #puts ">>> Replacing #{byte_array[i, arch.pointer_bytes].map{|x|'0x' + x.to_s(16)}.inspect} with #{addr_to_bytes(proxy.resolve(base_addr)).map{|x|'0x' + x.to_s(16)}.inspect}" if DEBUG_OUTPUT
          byte_array[i, arch.pointer_bytes] = addr_to_bytes(proxy.resolve(base_addr))
        end

        binary = package(byte_array)

        #puts "2nd pass: " + byte_array.inspect if DEBUG_OUTPUT

        objwriter = @delegate.object_file_factory.new
        objwriter.text(binary)
        objwriter.const(@symtab.const_data) if @symtab.const_size > 0
        objwriter.bss(@symtab.bss_size) if @symtab.bss_size > 0
        objwriter.reloc(@symtab.reloc_info)
        objwriter.symtab(@symtab)
        objwriter.serialize
      end

      # First pass over @ir: returns a flat array of bytes in which labels
      # have been replaced by relative addresses and var/const addresses by
      # 0xff placeholders. Each placeholder position is recorded in @proxies
      # and registered with the symbol table as a relocation.
      def resolve_labels
        bytes_read = 0
        bytes = []
        @ir.each do |x|
          if x.is_a?(Numeric)
            bytes << x
            bytes_read += 1

          elsif addr?(x)
            # remember this so we can replace the bogus addr later
            @proxies[bytes_read] = x[1]

            # add a relocation entry for this address
            @symtab.reloc(bytes_read)

            # fill in said bogus addr
            bogus_addr = [0xff] * arch.pointer_bytes
            bytes.concat(bogus_addr)
            bytes_read += bogus_addr.length

          # TODO find out if we should calculate addrs as offsets rather than
          #      absolute as they are done now. (ok for Mach-O, maybe not ELF)
          elsif label?(x)
            # the actual eip points to the next instruction already, so should we.
            real_ip = bytes_read + arch.bytes
            name = x[1]
            addr = @symtab.lookup_label(name) - real_ip # dest - src to get relative addr
            #puts "resolved label: #{x} = 0x#{@symtab.lookup_label(name).to_s(16)} (rel: 0x#{addr.to_s(16)}, ip = 0x#{real_ip.to_s(16)}, bytes_read = 0x#{bytes_read.to_s(16)})" if DEBUG_OUTPUT
            addr_bytes = addr_to_bytes(addr)
            bytes.concat(addr_bytes)
            bytes_read += addr_bytes.length

          else
            raise "unknown value in the IR at #{bytes_read} - #{x.inspect}"
          end
        end
        bytes
      end

      # Pack an array of byte values into a binary string.
      def package(bytes)
        bytes.pack('c*')
      end

      # Silly semantics, but labels don't count as an address since they
      # don't need to be deferred.
      def addr?(x)
        x.is_a?(Array) && [:var, :const].include?(x[0])
      end

      def label?(x)
        x.is_a?(Array) && x[0] == :label
      end

      # XXX this should probably evaluate the value somehow
      def define_const(name, bytes, value)
        @symtab.define_const(name, bytes, value)
        const(name)
      end

      # Define a variable with the given name and size in bytes. Redefining
      # an existing variable only prints a warning. Returns a proxy for it.
      def define_var(name, bytes = arch.word_bytes)
        if @symtab.var?(name)
          STDERR.puts "[warning] attempted to redefine #{name}"
        else
          @symtab.define_var(name, bytes)
        end
        var(name)
      end

      # Proxy for the named variable; an unknown name is reported on STDERR
      # but a proxy is still returned.
      def var(name)
        STDERR.puts "[error] undefined variable #{name}" unless var?(name)
        VariableProxy.new(name)
      end

      # Proxy for the named constant; an unknown name is reported on STDERR
      # but a proxy is still returned.
      def const(name)
        STDERR.puts "[error] undefined constant #{name}" unless const?(name)
        ConstantProxy.new(name)
      end

      def var?(name)
        @symtab.var?(name)
      end

      def const?(name)
        @symtab.const?(name)
      end

      # Define a variable unless it exists.
      def var!(name, bytes = arch.word_bytes)
        if var?(name)
          var(name)
        else
          define_var(name, bytes)
        end
      end

      # Count the bytes that were encoded in the given block.
      def asm
        # stash the current number of bytes written
        instruction_offset = @ip

        print "0x#{@ip.to_s(16).rjust(4, '0')}\t" if DEBUG_OUTPUT

        yield

        puts if DEBUG_OUTPUT

        # The byte count must be the last expression, otherwise the debug
        # `puts` above would become the return value.
        @ip - instruction_offset
      end

      # Append one byte to the IR. Accepts -128..255 and raises RuntimeError
      # for anything else.
      def emit_byte(byte)

        ##### The joke's on me! Array#pack('c*') already does this. It is nice to see
        # in the debugging output though, so it stays for now.
        #
        # Convert negative native ints into signed bytes.
        #
        # Calculate the signed byte as the difference between -1 (0xff) and some
        # number, X.  When byte == -1 we want X == 0, so X == -byte - 1.
        # Since -byte == ~byte + 1, then -byte - 1 == ~byte + 1 - 1 == ~byte,
        # and X == ~byte.  We want the *signed byte* -1, so we use 0xff,
        # *not* -1.  Ruby sees our signed bytes as positive ints 0-255.
        #
        byte = 0xff - ~byte if byte < 0 && byte >= -128

        # make sure it's a byte
        raise "not a byte: #{byte.inspect}" unless byte == byte & 0xff

        byte = byte & 0xff
        ### end of pointless code

        print "#{byte.to_s(16).rjust(2, '0')} " if DEBUG_OUTPUT

        @ir << byte
        @ip += 1
      end

      # addresses are emited as arrays of bytes, prefixed with :var, :const, or :label
      def emit_addr(type, name)
        placeholder = [type, name]
        puts placeholder.inspect if DEBUG_OUTPUT
        @ir << placeholder

        # addresses are a constant size
        @ip += arch.pointer_bytes
      end

      # Emit a variable address placeholder for a variable name or proxy.
      def emit_var(name_or_proxy)
        proxy = name_or_proxy.is_a?(VariableProxy) ? name_or_proxy : var(name_or_proxy)
        emit_addr(:var, proxy)
      end

      # Emit a constant address placeholder for a constant name or proxy.
      def emit_const(name_or_proxy)
        proxy = name_or_proxy.is_a?(VariableProxy) ? name_or_proxy : const(name_or_proxy)
        emit_addr(:const, proxy)
      end

      def emit_label(name)
        print "<#{name}> " if DEBUG_OUTPUT
        emit_addr(:label, name)
      end

      def make_label(suffix = nil)
        @symtab.unique_label(suffix)
      end

      def define_label(name)
        puts "\n#{name} (0x#{@ip.to_s(16)}):" if DEBUG_OUTPUT
        @symtab.define_label(name, @ip)
      end

      # Convert an address to an array of bytes in the byte order of the
      # target architecture.
      def addr_to_bytes(num)
        if arch.big_endian?
          num_to_big_endian(num)
        elsif arch.little_endian?
          num_to_little_endian(num)
        else
          raise 'oops'
        end
      end

      # Convert a number to an array of bytes, most significant byte first,
      # discarding excess bits. Only 4-byte pointers are implemented.
      def num_to_big_endian(num)
        case arch.pointer_bytes
        when 4
          [
            (num >> 24) & 0xff,
            (num >> 16) & 0xff,
            (num >> 8) & 0xff,
            num & 0xff
          ]
        else
          raise 'unimplemented'
        end
      end

      # Convert a number to an array of bytes, least significant byte first,
      # discarding excess bits.
      def num_to_little_endian(num)
        num_to_big_endian(num).reverse
      end

    end

  end
end

View file

@ -0,0 +1,13 @@
# The superclass must be loaded before this class body is evaluated;
# binary_assembler.rb requires this file ahead of variable_proxy.
require 'compiler/asm/variable_proxy'

class Compiler
  module ASM

    # A VariableProxy that identifies itself as a constant, so the binary
    # assembler resolves it against the const offset instead of the bss
    # offset.
    class ConstantProxy < VariableProxy
      def const?
        true
      end
    end

  end
end

342
lib/compiler/asm/cstruct.rb Normal file
View file

@ -0,0 +1,342 @@
# Struct does some trickery with custom allocators so we can't
# subclass it without writing C. Instead we define a CStruct class
# that does something similar enough for our purpose. It is
# subclassed just like any other class. A nice side-effect of this
# syntax is that it is always clear that a CStruct is just a class and
# instances of the struct are objects.
#
# Some light metaprogramming is used to make the following syntax possible:
#
# class MachHeader < CStruct
# uint :magic
# int :cputype
# int :cpusubtype
# ...
# int :flags
# end
#
# Inheritance works as you would expect.
#
# class LoadCommand < CStruct
# uint32 :cmd
# uint32 :cmdsize
# end
#
# # inherits cmd and cmdsize as the first 2 fields
# class SegmentCommand < LoadCommand
# string :segname, 16
# uint32 :vmaddr
# uint32 :vmsize
# end
#
# Nothing tricky or confusing there. Members of a CStruct class are
# declared in the class definition. A different definition using a
# more static approach probably wouldn't be very hard... if
# performance is critical ... but then why are you using Ruby? ;-)
#
#
# TODO support bit fields
#
# Bit fields should be supported by passing the number of bits a field
# should occupy. Perhaps we could use the size 'pack' for the rest of
# the field.
#
# class RelocationInfo < CStruct
# int32 :address
# uint32 :symbolnum, 24
# pack :pcrel, 1
# pack :length, 2
# pack :extern, 1
# pack :type, 4
# end
# Base class for fixed-layout, C-like structures. Subclasses declare their
# members with the type macros generated from SIZE_MAP (uint32, string, ...)
# and instances hold one value per member, in declaration order.
class CStruct

  ###################
  # Class Constants #
  ###################

  # Size in bytes.
  SIZE_MAP = {
    :int8   => 1,
    :uint8  => 1,
    :int16  => 2,
    :uint16 => 2,
    :int32  => 4,
    :uint32 => 4,
    :string => lambda { |*opts| opts.first }, # first opt is size
    # the last 3 are to make the language more C-like
    :int    => 4,
    :uint   => 4,
    :char   => 1
  }

  # Array#pack directive for each type, or a lambda that produces the packed
  # bytes itself (strings are NUL-padded or truncated to their declared size).
  PACK_MAP = {
    :int8   => 'c',
    :uint8  => 'C',
    :int16  => 's',
    :uint16 => 'S',
    :int32  => 'i',
    :uint32 => 'I',
    :string => lambda do |str, *opts|
      len = opts.first
      str.ljust(len, "\0")[0, len]
    end,
    # a few C-like names
    :int    => 'i',
    :uint   => 'I',
    :char   => 'C'
  }

  # Only needed when unpacking is different from packing, i.e. strings w/ lambdas in PACK_MAP.
  #
  # The lambda consumes the field from the front of the buffer: it reads the
  # full declared length, strips the NUL padding and removes exactly those
  # bytes so the members that follow are read from the right position.
  UNPACK_MAP = {
    :string => lambda do |str, *opts|
      len = opts.first
      val = str[0, len].sub(/\0*\z/, '')
      str.slice!(0, len)
      val
    end
  }

  ##########################
  # Class Instance Methods #
  ##########################

  # Note: const_get and const_set are used so the constants are bound
  #       at runtime, to the real class that has subclassed CStruct.
  #       I figured Ruby would do this but I haven't looked at the
  #       implementation of constants so it might be tricky.
  #
  #       All of this could probably be avoided with Ruby 1.9 and
  #       private class variables.  That is definitely something to
  #       experiment with.

  class << self

    def inherited(subclass)
      super
      subclass.instance_eval do
        # These "constants" are only constant references.  Structs can
        # be modified.  After the struct is defined it is still open,
        # but good practice would be not to change a struct after it
        # has been defined.
        #
        # To support inheritance properly we try to get these
        # constants from the enclosing scope (and clone them before
        # modifying them!), and default to empty, er, defaults.
        members      = const_defined?(:Members)       ? const_get(:Members).clone       : []
        member_index = const_defined?(:MemberIndex)   ? const_get(:MemberIndex).clone   : {}
        member_sizes = const_defined?(:MemberSizes)   ? const_get(:MemberSizes).clone   : {}
        member_opts  = const_defined?(:MemberOptions) ? const_get(:MemberOptions).clone : {}

        const_set(:Members, members)
        const_set(:MemberIndex, member_index)
        const_set(:MemberSizes, member_sizes)
        const_set(:MemberOptions, member_opts)
      end
    end

    # Define a method for each size name, and when that method is called it updates
    # the struct class accordingly.
    SIZE_MAP.keys.each do |type|
      define_method(type) do |name, *args|
        name = name.to_sym
        const_get(:MemberIndex)[name] = const_get(:Members).size
        const_get(:MemberSizes)[name] = type
        const_get(:MemberOptions)[name] = args
        const_get(:Members) << name
      end
    end

    # Return the number of members.
    def size
      const_get(:Members).size
    end
    alias_method :length, :size

    # Return the number of bytes occupied in memory or on disk.
    def bytesize
      const_get(:Members).inject(0) { |size, name| size + sizeof(name) }
    end

    # Size in bytes of the named member.
    def sizeof(name)
      value = SIZE_MAP[const_get(:MemberSizes)[name]]
      value.respond_to?(:call) ? value.call(*const_get(:MemberOptions)[name]) : value
    end

    # Build a struct from its serialized form.
    def new_from_bin(bin)
      new_struct = new
      new_struct.unserialize(bin)
    end

  end

  ####################
  # Instance Methods #
  ####################

  attr_reader :values

  # args are the member values, in declaration order.
  def initialize(*args)
    @values = args
  end

  # Pack every member, in order, into one byte string.
  def serialize
    vals = @values.clone
    membs = members.clone
    pack_pattern.map do |patt|
      name = membs.shift
      if patt.is_a?(String)
        [vals.shift].pack(patt)
      else
        patt.call(vals.shift, *member_options[name])
      end
    end.join
  end

  # Replace this struct's values with those decoded from bin. Returns self.
  def unserialize(bin)
    # Work on an unfrozen copy: the buffer is consumed destructively and the
    # caller's string must stay untouched.
    bin = bin.dup
    @values = []
    membs = members.clone
    unpack_pattern.each do |patt|
      name = membs.shift
      if patt.is_a?(String)
        @values += bin.unpack(patt)
        bin.slice!(0, sizeof(name))
      else
        @values << patt.call(bin, *member_options[name])
      end
    end
    self
  end

  def pack_pattern
    members.map { |name| PACK_MAP[member_sizes[name]] }
  end

  def unpack_pattern
    members.map { |name| UNPACK_MAP[member_sizes[name]] || PACK_MAP[member_sizes[name]] }
  end

  # Read a member by position or by name.
  def [](name_or_idx)
    case name_or_idx

    when Numeric
      idx = name_or_idx
      @values[idx]

    when String, Symbol
      name = name_or_idx.to_sym
      @values[member_index[name]]

    else
      raise ArgumentError, "expected name or index, got #{name_or_idx.inspect}"
    end
  end

  # Write a member by position or by name.
  def []=(name_or_idx, value)
    case name_or_idx

    when Numeric
      idx = name_or_idx
      @values[idx] = value

    when String, Symbol
      name = name_or_idx.to_sym
      @values[member_index[name]] = value

    else
      raise ArgumentError, "expected name or index, got #{name_or_idx.inspect}"
    end
  end

  # Structs are equal when the other object is of this struct's class and
  # holds the same values. The type is checked first so comparing against an
  # arbitrary object returns false instead of raising.
  def ==(other)
    other.is_a?(self.class) && other.values == @values
  end

  # Some of these are just to quack like Ruby's built-in Struct. YAGNI, but can't hurt either.

  def each(&block)
    @values.each(&block)
  end

  def each_pair(&block)
    members.zip(@values).each(&block)
  end

  def size
    members.size
  end
  alias_method :length, :size

  def sizeof(name)
    self.class.sizeof(name)
  end

  def bytesize
    self.class.bytesize
  end

  alias_method :to_a, :values

  # A few convenience methods.

  def members
    self.class::Members
  end

  def member_index
    self.class::MemberIndex
  end

  def member_sizes
    self.class::MemberSizes
  end

  def member_options
    self.class::MemberOptions
  end

  # The last expression is returned, so return self instead of junk.
  self
end
# a small test
# Smoke test, executed only when this file is run directly: declares a small
# struct, prints its metadata, then round-trips it through serialize and
# new_from_bin and reports whether the bytes survived.
if $0 == __FILE__
  class MachHeader < CStruct
    uint :magic
    int :cputype
    int :cpusubtype
    string :segname, 16
  end

  [MachHeader::Members, MachHeader::MemberIndex, MachHeader::MemberSizes].each do |table|
    puts table.inspect
  end
  puts "# of MachHeader members: #{MachHeader.size}, size in bytes: #{MachHeader.bytesize}"

  mh = MachHeader.new(0xfeedface, 7, 3, "foobar")
  [:magic, :cputype, :cpusubtype, :segname].each do |field|
    puts "#{field}(#{MachHeader.sizeof(field)}): #{mh[field].inspect}"
  end
  puts mh.pack_pattern.inspect

  binstr = mh.serialize
  puts "values: #{mh.values.inspect}"

  newmh = MachHeader.new_from_bin(binstr)
  puts "new values: #{newmh.values.inspect}"

  newbinstr = newmh.serialize
  puts "serialized: #{binstr.inspect}"
  puts "unserialized: #{newbinstr.inspect}"
  puts "new == old ? #{newbinstr == binstr}"
end

10
lib/compiler/asm/elf.rb Normal file
View file

@ -0,0 +1,10 @@
require 'compiler/asm/elf/structs'
# Declares the Compiler::ASM::ELF namespace. The module body is empty here;
# this file's only other effect is the require of the ELF structs above.
class Compiler
module ASM
module ELF
end
end
end

View file

@ -0,0 +1,7 @@
# Placeholder symbol table for ELF output; adds nothing to SymbolTable yet.
# NOTE(review): Compiler#wire_elf looks up ASM::ELF::SymbolTable inside the
# Compiler class, while this defines ASM::ELFSymbolTable at the top level --
# the names need to be reconciled before ELF output can be wired.
module ASM
class ELFSymbolTable < SymbolTable
end
end

View file

@ -0,0 +1,9 @@
# Placeholder object-file writer for ELF output; adds nothing to ObjWriter yet.
# NOTE(review): Compiler#wire_elf looks up ASM::ELF::ObjectFile inside the
# Compiler class, while this defines ASM::ELFWriter at the top level -- the
# names need to be reconciled before ELF output can be wired.
module ASM
class ELFWriter < ObjWriter
end
end

10
lib/compiler/asm/macho.rb Normal file
View file

@ -0,0 +1,10 @@
require 'compiler/asm/macho/structs'
# Declares the Compiler::ASM::MachO namespace. The module body is empty here;
# this file's only other effect is the require of the Mach-O structs above.
class Compiler
module ASM
module MachO
end
end
end

View file

@ -0,0 +1,61 @@
# CStruct lives under lib/compiler/asm/, so it is required by that path.
require 'compiler/asm/cstruct'

# The MachO module contains constants and structures related to the
# Mach Object format (Mach-O).  They are relevant to Darwin on OS X.
#
# Constants and structures as defined in /usr/include/mach-o/loader.h
# on Mac OS X Leopard (10.5.7).  Also see <mach-o/stab.h>,
# <mach-o/nlist.h>, and <mach-o/reloc.h>.

class Compiler
  module MachO

    # Fields shared by every load command; specific commands subclass this
    # and append their own members after cmd and cmdsize.
    class LoadCommand < CStruct
      uint32 :cmd
      uint32 :cmdsize
    end

    # Values for the cmd member of LoadCommand CStructs (incomplete!).
    LC_SEGMENT    = 0x1
    LC_SYMTAB     = 0x2
    LC_SYMSEG     = 0x3
    LC_THREAD     = 0x4
    LC_UNIXTHREAD = 0x5

    # Load command describing a segment (cmd is LC_SEGMENT).
    class SegmentCommand < LoadCommand
      string :segname, 16
      uint32 :vmaddr
      uint32 :vmsize
      uint32 :fileoff
      uint32 :filesize
      int32  :maxprot
      int32  :initprot
      uint32 :nsects
      uint32 :flags
    end

    # Values for protection fields, maxprot and initprot.
    VM_PROT_NONE      = 0x00
    VM_PROT_READ      = 0x01
    VM_PROT_WRITE     = 0x02
    VM_PROT_EXECUTE   = 0x04
    VM_PROT_NO_CHANGE = 0x08
    VM_PROT_COPY      = 0x10

    # Load command locating the symbol and string tables (cmd is LC_SYMTAB).
    class SymbolTableCommand < LoadCommand
      uint32 :symoff  # Points to an array of Nlist structs.
      uint32 :nsyms   # Number of entries in said array.
      uint32 :stroff  # Offset of the string table.
      uint32 :strsize # Size of the string table in bytes.
    end

    # Maps a cmd value to the struct class used to represent that command.
    LOAD_COMMAND_STRUCT_MAP = {
      LC_SEGMENT => SegmentCommand,
      LC_SYMTAB  => SymbolTableCommand
    }

  end
end

View file

@ -0,0 +1,46 @@
# CStruct lives under lib/compiler/asm/, so it is required by that path.
require 'compiler/asm/cstruct'

# The MachO module contains constants and structures related to the
# Mach Object format (Mach-O).  They are relevant to Darwin on OS X.
#
# Constants and structures as defined in /usr/include/mach-o/loader.h
# on Mac OS X Leopard (10.5.7).  Also see <mach-o/stab.h>,
# <mach-o/nlist.h>, and <mach-o/reloc.h>.

class Compiler
  module MachO

    # Appears at the beginning of every Mach object file.
    class MachHeader < CStruct
      uint32 :magic
      int32  :cputype
      int32  :cpusubtype
      uint32 :filetype
      uint32 :ncmds
      uint32 :sizeofcmds
      uint32 :flags
    end

    # Values for the magic field.
    MH_MAGIC = 0xfeedface # Mach magic number (big-endian).
    MH_CIGAM = 0xcefaedfe # Little-endian version.

    # Values for the filetype field.
    MH_OBJECT     = 0x1
    MH_EXECUTE    = 0x2
    MH_FVMLIB     = 0x3
    MH_CORE       = 0x4
    MH_PRELOAD    = 0x5
    MH_DYLIB      = 0x6
    MH_DYLINKER   = 0x7
    MH_BUNDLE     = 0x8
    MH_DYLIB_STUB = 0x9
    MH_DSYM       = 0xa

    # CPU types and subtypes (only Intel for now).
    CPU_TYPE_X86        = 7
    CPU_TYPE_I386       = CPU_TYPE_X86
    CPU_SUBTYPE_X86_ALL = 3

  end
end

View file

@ -0,0 +1,50 @@
# CStruct lives under lib/compiler/asm/, so it is required by that path.
require 'compiler/asm/cstruct'

# The MachO module contains constants and structures related to the
# Mach Object format (Mach-O).  They are relevant to Darwin on OS X.
#
# Constants and structures as defined in /usr/include/mach-o/loader.h
# on Mac OS X Leopard (10.5.7).  Also see <mach-o/stab.h>,
# <mach-o/nlist.h>, and <mach-o/reloc.h>.

class Compiler
  module MachO

    ########################
    # Symbol table support #
    ########################

    # Nlist is used to describe symbols.
    class Nlist < CStruct
      uint32 :n_strx  # Index into string table. Index of zero is the empty string.
      uint8  :n_type  # Type flag (see below).
      uint8  :n_sect  # Section number (from 1) or NO_SECT.
      uint16 :n_desc  # TODO See <mach-o/stab.h>.
      uint32 :n_value # The symbol's value (or stab offset).
    end

    # Type flag (see <mach-o/nlist.h> for more details)
    # ---------
    #
    # This field consists of four bitfields:
    #
    #   uchar N_STAB : 3
    #   uchar N_PEXT : 1
    #   uchar N_TYPE : 3
    #   uchar N_EXT  : 1
    #
    N_STAB = 0xe0 # if any bits set => symbolic debugging info
    N_PEXT = 0x10 # private external symbol bit
    N_TYPE = 0x0e # mask for the type bits
    N_EXT  = 0x01 # external symbol bit, set for external symbols (e.g. globals)

    # Values for N_TYPE. (incomplete!)
    N_UNDF = 0x0 # undefined, n_sect == NO_SECT
    N_ABS  = 0x2 # absolute, n_sect == NO_SECT
    N_SECT = 0xe # defined in section number n_sect

    NO_SECT  = 0
    MAX_SECT = 255

  end
end

View file

@ -0,0 +1,373 @@
require 'asm/macho'
class Compiler
module MachO
class ObjectFile
attr_accessor :header, :load_commands, :sections, :data
attr_accessor :current_segment
def initialize(filetype = MH_OBJECT)
@header = MachHeader.new(MH_MAGIC, CPU_TYPE_X86, CPU_SUBTYPE_X86_ALL, filetype, 0, 0, 0)
@load_commands = [] # All defined segments.
@sections = {} # Map of segment names to lists of sections.
@section_disk_size = Hash.new(0) # Sections store their VM size so we need their sizes on disk.
@section_offset = 0 # Offset of the next section's data, in bytes.
@data = [] # Blobs of data that appear at the end of the file.
# (text, data, relocation info, symtab, ...)
@current_segment = nil # An alias for the last defined segment.
@text_segname = nil # Name of __TEXT segement
@text_sect_index = nil # Index of __text section
@text_data_index = nil # Index into @data of __text section data
@reloc_info = nil # Copy of relocation info array
end
# Define a LoadCommand in this file. The header's ncmds and sizeofcmds
# fields are updated automatically to keep things in sync. If a block is
# given it is passed the new LoadCommand struct after all other
# initialization has been done.
#
# Other methods that create any type of load command should use this
# method to do so. Right now the only types supported are LC_SEGMENT
# and LC_SYMTAB. Modify asm/macho.rb to add structs for other types, and
# add them to LOAD_COMMAND_STRUCT_MAP.
def load_command(cmdtype)
struct = LOAD_COMMAND_STRUCT_MAP[cmdtype]
unless struct
raise "unsupported load command type: #{cmdtype.inspect}," +
" supported types: #{LOAD_COMMAND_STRUCT_MAP.keys.sort.inspect}"
end
# Fill in all the unknown fields with 0, this is nonsense for
# string fields but that doesn't really matter.
dummy_vals = [0] * (struct::Members.size - 2)
# cmd cmdsize ...
command = struct.new(cmdtype, struct.bytesize, *dummy_vals)
@load_commands << command
@header[:ncmds] += 1
@header[:sizeofcmds] += command.bytesize
yield(command) if block_given?
return command
end
# Define a segment in this file. If a block is given it is passed
# the new segment. You can chain calls to segment, it returns self.
#
# Mach object files should only contain one anonymous segment. This
# is not checked but should be kept in mind when crafting files.
def segment(name, &block)
@current_segment = load_command(LC_SEGMENT) do |seg|
seg[:segname] = name
block.call(seg) if block
end
return self
end
# Define a section under the given segment. nsects and cmdsize are
# updated automatically. segname can't be derived from the segment
# that this section is defined under, as they can differ.
#
# Mach object files have the __text, __data, and other common
# sections all defined under one anonymous segment, but their segment
# names reflect their final positions after linking. The linker plonks
# them in the segment that they name.
def section(name, segname, data = '', vmsize=data.size,
segment = @current_segment, type = S_REGULAR)
# Create the new section.
section = Section.new(name, segname, @section_offset, vmsize, 0, 0, 0, 0, 0, 0, type)
# Add this section to the map of segment names to sections.
(@sections[segment[:segname]] ||= []) << section
@section_disk_size[name] = data.size
@section_offset += data.size
@data << data if data.size > 0
# Update the header.
@header[:sizeofcmds] += section.bytesize
# Update the segment.
segment[:nsects] += 1
segment[:cmdsize] += section.bytesize
yield(section) if block_given?
return section
end
# Define a standard text section under the current segment (if present).
#
# If there is no current segment then we act according to the file's type
# (specified in the header). Segments are created if they do not exist.
#
# When it is MH_OBJECT the text section is defined under a single,
# nameless segment, but the section's segment name is set to the name
# given here.
#
# For MH_EXECUTE files the text section goes under the segment with the
# name given (__TEXT).
def text(data, sectname = '__text', segname='__TEXT')
real_segname = nil
unless @current_segment
real_segname = segname_based_on_filetype(segname)
segment(real_segname) do |seg|
seg[:maxprot] = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE
seg[:initprot] = VM_PROT_READ | VM_PROT_EXECUTE
end
end
section(sectname, segname, data) do |sect|
# reloff and nreloc are calculated later (in calculate_offsets)
sect[:flags] = 0x400 # S_ATTR_SOME_INSTRUCTIONS
end
# Remember where section and data are so we can update them later.
@text_segname = real_segname || segname
@text_sect_index = @sections[@text_segname].length-1
@text_data_index = @data.length-1
return self
end
def update_text(data)
raise 'no __text segment defined yet' unless @text_data_index
@data[@text_data_index] = data
end
# Basis for #data, #const, and #bss methods.
def segment_based_on_filetype(segname, options = {})
unless @current_segment
permissions = VM_PROT_READ
permisions |= VM_PROT_WRITE if options.delete(:writable)
segment(segname_based_on_filetype(segname)) do |seg|
seg[:initprot] = seg[:maxprot] = permissions
end
end
yield if block_given?
return self
end
# Define a standard data section under the current segment (if present).
# This behaves similarly to the text method.
#
def data(data, sectname = '__data', segname='__DATA')
segment_based_on_filetype(segname, :writable => true) do
section(sectname, segname, data)
end
end
# Define a standard const section under the current segment (if present).
# This behaves similarly to the data method.
#
def const(data, sectname = '__const', segname='__DATA')
segment_based_on_filetype(segname) do
section(sectname, segname, data)
end
end
# Define a standard BSS section under the current segment (if present).
# This behaves similarly to the data method but accepts a VM size instead
# of a blob, and no data is written to file since this section is for
# uninitialized data.
#
def bss(vmsize, sectname = '__bss', segname='__DATA')
segment_based_on_filetype(segname, :writable => true) do
section(sectname, segname, '', vmsize)
end
end
# Define a relocation table. Usually between segments and the
# symbol table.
#
# Accepts an array of relocation info structs.
def reloc(reloc_info)
@data << if reloc_info.respond_to?(:join)
reloc_info.map {|r| r.serialize}.join
else
reloc_info
end
@reloc_info = reloc_info.map {|x| x.clone}
return self
end
# Define a symbol table. This should usually be placed at the end of the
# file.
#
# This function is overloaded to accept either an array of Nlist structs
# packed into a byte string (i.e. a C array) and a string table, or a
# single parameter: any type of SymbolTable.
def symtab(nlist_ary_or_symtab, stab = nil)
if stab.nil?
symtab = nlist_ary_or_symtab
stab = symtab.stab
nlist_ary = symtab.nlist_ary
else
nlist_ary = nlist_ary_or_symtab
end
load_command(LC_SYMTAB) do |st|
st[:nsyms] = nlist_ary.size
st[:strsize] = stab.size
# symoff and stroff are filled in when offsets are recalculated.
end
# puts ">>> Defining symbol table:"
# puts ">>> #{nlist_ary.size} symbols"
# puts ">>> stab = #{stab.inspect}"
# puts ">>> nlist_ary = #{nlist_ary.inspect}"
# puts ">>> (serialized) = #{nlist_ary.map{|n|n.serialize}.join.inspect}"
@data << nlist_ary.map {|n| n.serialize}.join
@data << stab
return self
end
# Serialize the entire MachO file into a byte string. This is simple
# thanks to CStruct#serialize.
def serialize
# TODO sanity checks, e.g. assert(@header[:ncmds] == @load_command.size)
# ... perhaps an option to recalculate such data as well.
# Now that we have all the pieces of the file defined we can calculate
# the file offsets of segments and sections.
calculate_offsets
###################################
# Mach-O file Part 1: Mach Header #
###################################
@header.serialize +
#####################################
# Mach-O file Part 2: Load Commands #
#####################################
# dump each load command (which include the section headers under them)
@load_commands.map do |cmd|
sects = @sections[cmd[:segname]] rescue []
sects.inject(cmd.serialize) do |data, sect|
data + sect.serialize
end
end.join +
###################################
# Mach-O file Part 3: Binary data #
###################################
@data.join
end
# Update the file offsets in segments and sections.
def calculate_offsets
  # Updates sizes and file offsets in segments, sections and the symbol
  # table command. Raises for any load command other than LC_SEGMENT and
  # LC_SYMTAB.

  # Running offset into the file on disk, used to update the structures.
  offset = @header.bytesize

  # First pass over load commands. Most sizes are filled in here.
  @load_commands.each do |cmd|
    case cmd[:cmd]
    when LC_SEGMENT
      seg = cmd
      sections = @sections[seg[:segname]]
      section_size = sections.size * Section.bytesize
      section_vm_size = sections.inject(0) { |total, sect| total + sect[:size] }
      section_disk_size = sections.inject(0) do |total, sect|
        total + @section_disk_size[sect[:sectname]]
      end

      ### TODO this should be redundant. try commenting it out one day.
      seg[:nsects] = sections.size
      seg[:cmdsize] = seg.bytesize + section_size
      ###

      seg[:vmsize] = section_vm_size
      seg[:filesize] = section_disk_size
    when LC_SYMTAB
      # nop
    else
      raise "unsupported load command: #{cmd.inspect}"
    end

    offset += cmd[:cmdsize]
  end

  # offset now points to the end of the Mach-O headers, i.e. the beginning
  # of the binary blobs of section data.

  # Second pass over load commands. Fill in file offsets.
  @load_commands.each do |cmd|
    case cmd[:cmd]
    when LC_SEGMENT
      seg = cmd
      sections = @sections[seg[:segname]]
      seg[:fileoff] = offset
      sections.each do |sect|
        sect[:offset] = offset
        offset += @section_disk_size[sect[:sectname]]
      end
    when LC_SYMTAB
      # An empty relocation list is truthy in Ruby, so check for entries
      # explicitly before asking the first one for its size.
      if @reloc_info && !@reloc_info.empty?
        # update text section with relocation info
        __text = @sections[@text_segname][@text_sect_index]
        __text[:reloff] = offset
        __text[:nreloc] = @reloc_info.length
        offset += @reloc_info.first.bytesize * @reloc_info.length
      end

      st = cmd
      st[:symoff] = offset
      offset += st[:nsyms] * Nlist.bytesize
      st[:stroff] = offset
      offset += st[:strsize]
      # No else clause is necessary, the first pass already rejected
      # unsupported commands.
    end
  end
end
#######
private
#######
def segname_based_on_filetype(segname)
  # Picks the segment name to use for the header's file type: the empty
  # string for MH_OBJECT, the given name for MH_EXECUTE. Any other file
  # type raises.
  filetype = @header[:filetype]
  return '' if filetype == MH_OBJECT
  return segname if filetype == MH_EXECUTE

  raise "unsupported MachO file type: #{@header.inspect}"
end
end
end
end

View file

@ -0,0 +1,35 @@
require 'compiler/cstruct'
# The MachO module contains constants and structures related to the
# Mach Object format (Mach-O). They are relevant to Darwin on OS X.
#
# Constants and structures as defined in /usr/include/mach-o/loader.h
# on Mac OS X Leopard (10.5.7). Also see <mach-o/stab.h>,
# <mach-o/nlist.h>, and <mach-o/reloc.h>.
class Compiler
module MachO
# One relocation entry: an address within a section plus the packed
# r_info word whose layout is described in the NOTE below.
class RelocationInfo < CStruct
int32 :r_address # offset in the section to what is being relocated
uint32 :r_info # packed bit field, see NOTE below
end
# NOTE: r_info is a packed bit field with the following members:
#
# (CStruct should eventually support bitfields, but doesn't right now.)
#
# r_symbolnum : 24 -- symbol index if r_extern == 1 or section ordinal if r_extern == 0
# r_pcrel : 1 -- was relocated pc relative already
# r_length : 2 -- 0=byte, 1=word, 2=long, 3=quad
# r_extern : 1 -- 1 for exported symbols, 0 otherwise
# r_type : 4 -- if not 0, machine specific relocation type (always 0)
R_ABS = 0 # Absolute relocation type
# (r_symbolnum == R_ABS for absolute symbols that don't need reloc)
# Relocation types (r_type)
GENERIC_RELOC_VANILLA = 0
end
end

View file

@ -0,0 +1,34 @@
require 'compiler/cstruct'
# The MachO module contains constants and structures related to the
# Mach Object format (Mach-O). They are relevant to Darwin on OS X.
#
# Constants and structures as defined in /usr/include/mach-o/loader.h
# on Mac OS X Leopard (10.5.7). Also see <mach-o/stab.h>,
# <mach-o/nlist.h>, and <mach-o/reloc.h>.
class Compiler
module MachO
# Section header record. Field order and widths define the serialized
# layout, so they must not be reordered.
class Section < CStruct
string :sectname, 16 # section name, 16-byte field
string :segname, 16 # name of the owning segment, 16-byte field
uint32 :addr
uint32 :size
uint32 :offset
uint32 :align
uint32 :reloff
uint32 :nreloc
uint32 :flags
uint32 :reserved1
uint32 :reserved2
end
# Values for the type bitfield (mask 0x000000ff) of the flags field.
# (incomplete!)
S_REGULAR = 0x0
S_ZEROFILL = 0x1
S_CSTRING_LITERALS = 0x2
end
end

View file

@ -0,0 +1,53 @@
require 'compiler/macho/mach_header'
require 'compiler/macho/load_commands'
require 'compiler/macho/section'
require 'compiler/macho/relocation_info'
# The MachO module contains constants and structures related to the
# Mach Object format (Mach-O). They are relevant to Darwin on OS X.
#
# Constants and structures as defined in /usr/include/mach-o/loader.h
# on Mac OS X Leopard (10.5.7). Also see <mach-o/stab.h>,
# <mach-o/nlist.h>, and <mach-o/reloc.h>.
class Compiler
module MachO
########################
# Symbol table support #
########################
# Nlist is used to describe symbols. Field order and widths define the
# serialized layout, so they must not be reordered.
class Nlist < CStruct
uint32 :n_strx # Index into string table. Index of zero is the empty string.
uint8 :n_type # Type flag (see below).
uint8 :n_sect # Section number (from 1) or NO_SECT.
uint16 :n_desc # TODO See <mach-o/stab.h>.
uint32 :n_value # The symbol's value (or stab offset).
end
# Type flag (see <mach-o/nlist.h> for more details)
# ---------
#
# This field consists of four bitfields:
#
# uchar N_STAB : 3
# uchar N_PEXT : 1
# uchar N_TYPE : 3
# uchar N_EXT : 1
#
# The constants below are the masks for those bitfields within n_type.
N_STAB = 0xe0 # if any bits set => symbolic debugging info
N_PEXT = 0x10 # private external symbol bit
N_TYPE = 0x0e # mask for the type bits
N_EXT = 0x01 # external symbol bit, set for external symbols (e.g. globals)
# Values for N_TYPE. (incomplete!)
N_UNDF = 0x0 # undefined, n_sect == NO_SECT
N_ABS = 0x2 # absolute, n_sect == NO_SECT
N_SECT = 0xe # defined in section number n_sect
# Bounds for the n_sect field.
NO_SECT = 0
MAX_SECT = 255
end
end

View file

@ -0,0 +1,31 @@
require 'compiler/macho'
class Compiler
  module MachO
    # Plain holder for the attributes of one symbol. It knows how to turn
    # itself into an Nlist record once its string table index is known.
    class Symbol
      attr_accessor :name, :type, :segnum, :desc, :value

      def initialize(name, type, segnum, desc, value)
        @name, @type, @segnum, @desc, @value = name, type, segnum, desc, value
      end

      # Build the Nlist entry for this symbol; strx is the offset of the
      # symbol's name in the string table.
      def to_nlist(strx)
        Nlist.new(strx, @type, @segnum, @desc, @value)
      end

      # A symbol prints as its name.
      def to_s
        @name
      end
    end
  end
end

View file

@ -0,0 +1,88 @@
require 'compiler/macho/structs'
require 'compiler/macho/symbol'
require 'compiler/asm/symbol_table'
class Compiler
  module MachO
    # Mach-O specific symbol table: turns the label/constant/variable maps
    # kept by the abstract table into Nlist entries, a string table and
    # relocation records.
    #
    # The superclass is the abstract table defined by the required file
    # 'compiler/asm/symbol_table', i.e. Compiler::ASM::SymbolTable.
    class SymbolTable < ASM::SymbolTable
      # Build Symbol objects for a name => offset map, ordered by offset.
      def make_symbols(vars, base_addr, type, segnum)
        # Note: Sorting a Ruby hash gives an alist, e.g. [[<key>, <value>], ...]
        vars.sort { |a, b| a[1] <=> b[1] }.map do |name, offset|
          Symbol.new(name, type, segnum, 0, base_addr + offset)
        end
      end

      # All symbols in section order: labels, then constants, then variables.
      def all_symbols
        # TODO FIXME:
        # - the last var exported ends up after main somewhere... WTF?!
        # - All labels are exported. This should be changed and only functions exported!
        section = 1

        # Functions (section #1, __text)
        symbols = make_symbols(@labels, text_offset, N_SECT | N_EXT, section)
        section += 1

        # Constants (section #2, __const)
        if @consts.size > 0
          symbols += make_symbols(@consts, const_offset, N_SECT, section)
          section += 1
        end

        # Variables (section #3, __bss, or #2 when there are no constants)
        if @vars.size > 0
          symbols += make_symbols(@vars, bss_offset, N_SECT, section)
        end

        symbols
      end

      # Section number of __bss. This is fairly stupid but works.
      def bss_section
        @consts.size > 0 ? 3 : 2
      end

      # Nlist entries for all symbols. Symbols sharing a name share one
      # string table index.
      def nlist_ary
        string_index = {}
        strx = 1 # index 0 is the leading null byte (the empty string)
        all_symbols.map do |sym|
          key = sym.name.to_sym
          unless string_index.has_key?(key)
            string_index[key] = strx
            strx += sym.name.length + 1 # +1 for the null byte
          end
          sym.to_nlist(string_index[key])
        end
      end

      # The string table: null-separated names, beginning and ending with a
      # null byte. Each distinct name is emitted once, in first-seen order,
      # so the offsets match the indices computed by nlist_ary.
      def stab
        names = all_symbols.map { |sym| sym.to_s }.uniq
        ['', *names, ''].join("\0")
      end

      # Record a relocation entry. The arguments are packed into r_info
      # using the bit layout documented next to RelocationInfo.
      def reloc(r_address, r_symbolnum = 0, r_length = 2, r_extern = 0, r_pcrel = 0, r_type = 0)
        r_info = (r_type << 28) | (r_extern << 27) | (r_length << 25) |
                 (r_pcrel << 24) | r_symbolnum
        @reloc_info << RelocationInfo.new(r_address, r_info)
      end

      # Relocation entries with the bss section number OR-ed into r_info.
      def reloc_info
        n = bss_section
        @reloc_info.each { |r| r[:r_info] |= n }
      end

      # Place the const data right after the text and bss right after const.
      def calculate_offsets(text_size)
        @const_offset = @text_offset + text_size
        @bss_offset = @const_offset + @const_size
      end
    end
  end
end

View file

@ -0,0 +1,28 @@
class Compiler
  module ASM
    # Raised when an abstract ObjWriter method has not been overridden.
    class UnimplementedMethodError < RuntimeError; end

    # Abstract base class for object file writers.
    class ObjWriter
      # Serialize the object file and write it to filename in binary mode.
      def write!(filename)
        File.open(filename, 'wb') do |file|
          file.print(serialize)
        end
      end

      # Raise UnimplementedMethodError carrying the method's name.
      def fail(name)
        raise UnimplementedMethodError.new(name)
      end

      # These methods must be defined for most uses of the library. The
      # placeholders accept any arguments and an optional block so that a
      # call to an unimplemented method reports UnimplementedMethodError
      # rather than an unrelated ArgumentError.
      %w[header segment section text data bss symtab serialize].each do |name|
        define_method(name) { |*_args, &_block| fail(name) }
      end
    end
  end
end

View file

@ -0,0 +1,70 @@
class Compiler
  module ASM
    # Acts like a register and can be used as the base or index in an
    # effective address.
    #
    # e.g. [EAX] or [ESI+EBX] or [EAX + 0xff] or [(ESI + EBX) * 2]
    #
    # Note that Ruby's operator precedence applies: the index must be
    # added before the scale is applied, hence the parentheses above.
    class RegisterProxy
      attr_reader :name, :size, :regnum
      attr_reader :base, :index, :scale

      def initialize(name, size, regnum)
        @name = name # attrs are read-only so sharing is ok
        @size = size
        @regnum = regnum
        @base = self
      end

      # Returns a copy of this register with the given index (a register or
      # a numeric displacement) attached. The receiver is left untouched.
      def +(index)
        raise "index already specified" if @index
        new_reg = self.clone
        new_reg.instance_variable_set('@index', index)
        new_reg
      end

      # Sets the scale (1, 2, 4 or 8) on an address that already has an
      # index and returns it. Such an address is always a copy produced by
      # #+, so shared register constants are never modified.
      def *(scale)
        raise "index must come first" unless @index
        # Check the stored scale, not the argument, which is always truthy.
        raise "scale already specified" if @scale
        raise "unsupported scale: #{scale}" unless scale.to_s.match(/\A[1248]\z/)
        @scale = scale
        self
      end

      def scale?
        @scale
      end

      def index?
        @index
      end

      # True for a bare register, i.e. one with neither index nor scale.
      def register?
        @scale.nil? && @index.nil?
      end

      def to_s
        [ @name.to_s,
          @index && "+#{@index}",
          @scale && "*#{@scale}"
        ].compact.join
      end

      def inspect
        to_s
      end
    end
  end
end

View file

@ -0,0 +1,99 @@
class Compiler
  module ASM
    # Abstract symbol table.
    #
    # Maps variable, constant, and label names to offsets within their
    # respective sections. Final addresses are derived from these offsets
    # later, once the section locations are known.
    class SymbolTable
      attr_accessor :text_offset, :bss_offset, :const_offset
      attr_reader :const_data, :const_size, :bss_size, :reloc_info

      def initialize
        # name => offset maps for bss variables, constants and functions
        @vars, @consts, @funcs = {}, {}, {}

        @const_data = '' # packed constant values
        @const_size = 0  # size of the const section
        @bss_size = 0    # size of the bss section

        # Looking up a label that was never defined is an error.
        @labels = Hash.new { |_hash, label| raise "undefined label: #{label}" }
        @num_labels = 0 # counter used to generate unique labels
        @num_labels_with_suffix = Hash.new(0)

        # Relocation info. Subclasses should define a reloc method.
        @reloc_info = []

        @text_offset = @bss_offset = @const_offset = 0
      end

      # Generate a unique label such as "L000001" or, with a suffix,
      # "L000002_loop_1" (the trailing number counts uses of that suffix).
      def unique_label(suffix = nil)
        @num_labels += 1
        tail = suffix
        if suffix
          count = (@num_labels_with_suffix[suffix] += 1)
          tail = "_#{suffix}_#{count}"
        end
        format('L%06d%s', @num_labels, tail)
      end

      # Record a label's offset and return the label name.
      def define_label(name, offset)
        @labels[name] = offset
        name
      end

      def lookup_label(name)
        @labels[name]
      end

      # Reserve bytes in bss for name; returns the new bss size.
      def define_var(name, bytes)
        @vars[name] = @bss_size
        @bss_size += bytes
      end

      # Record a constant at the current end of the const section and
      # append its value, packed with the 'i' directive, to the const data.
      def define_const(name, value, bytes)
        @consts[name] = @const_size
        @const_size += bytes
        @const_data << [value].pack('i')
      end

      def define_func(name, offset)
        @funcs[name] = offset
      end

      def var(name)
        @vars[name]
      end

      def var?(name)
        @vars.has_key?(name)
      end

      def const(name)
        @consts[name]
      end

      def const?(name)
        @consts.has_key?(name)
      end
    end
  end
end

View file

@ -0,0 +1,73 @@
# sjs
# may 2009
require 'compiler/asm/assembler'
class Compiler
  module ASM
    # Base class for assemblers that emit assembly source text, which is
    # later substituted into a per-platform template file.
    class TextAssembler < Assembler
      def initialize(delegate)
        super(delegate)
        @vars = {} # Symbol table, maps names to locations in BSS.
        @data = ''
        @bss = ''
        @code = ''
        return if File.readable?(template_filename)
        raise "unsupported platform/arch: #{delegate.platform}/#{arch.name}"
      end

      # Path of the template for the current arch and platform, located in
      # a directory named after the arch next to this file.
      def template_filename
        @template_filename ||= begin
          here = File.dirname(__FILE__)
          File.join(here, arch.name, "template.#{delegate.platform}.asm")
        end
      end

      # Define a constant. No-op here.
      def const(name, value)
      end

      # Define a variable with the given name and size in bytes. Warns on
      # STDERR instead of redefining an existing variable.
      def define_var(name, bytes = arch.bytes)
        if var?(name)
          STDERR.puts "[warning] attempted to redefine #{name}"
        else
          define_var_impl(name, bytes)
        end
      end

      # Hook for subclasses. No-op here.
      def define_var_impl(name, bytes = arch.bytes)
      end

      def var(name)
        @vars[name]
      end
      alias_method :var?, :var

      # Emit a line of code wrapped between a tab and a newline. Pass
      # tab: nil (or another prefix) to override the leading tab.
      def emit(code, options = {})
        prefix = options.fetch(:tab, "\t")
        @code << "#{prefix}#{code}\n"
      end

      def label(name = nil)
        # FIXME
        name = super
        @labels[name] = name
        name
      end

      # Hook for subclasses. No-op here.
      def output
      end

      # Emit a label definition line without the leading tab.
      def emit_label(name = label)
        emit("#{name}:", tab: nil)
      end
    end
  end
end

View file

@ -0,0 +1,43 @@
class Compiler
  module ASM
    # Wrap a variable's address so that we can perform arithmetic on it
    # before resolving it when we know where things will go in memory.
    # All we do is catch arithmetic ops and then provide a means to
    # resolve a final address by replaying them later.
    #
    # e.g. [symtab.var('i')] or [symtab.var('i') * 2]
    class VariableProxy
      attr_reader :name
      attr_accessor :ops

      def initialize(name)
        @name = name
        @ops = []
      end

      # Each operator returns a new proxy that carries every operation
      # recorded so far plus the new one, so chained expressions such as
      # (var * 2) + 4 replay in full. The receiver is never modified.
      %w[+ * / - % & |].each do |op|
        define_method(op) do |*args|
          new_proxy = self.class.new(@name)
          new_proxy.ops = @ops + [[op, *args]]
          new_proxy
        end
      end

      # Replay the recorded operations, in order, starting from base_addr.
      # XXX should this perhaps use the offset instead?
      def resolve(base_addr)
        @ops.inject(base_addr) do |addr, op|
          addr.send(*op)
        end
      end

      # Overridden by ConstantProxy
      def const?
        false
      end
    end
  end
end

View file

@ -0,0 +1,42 @@
require 'compiler/asm/arch'
class Compiler
  module ASM
    module X86
      # x86 architecture description: machine code emitted before and after
      # the compiled program on each platform, plus a lazily created shared
      # ASM::Arch instance.
      module Arch
        BINARY_PREAMBLE = {
          'linux'  => [],
          'darwin' => [
            0x55,                   # push ebp
            0x89, 0xe5,             # mov ebp, esp
            0x81, 0xec, 8, 0, 0, 0  # sub esp, 8
          ]
        }

        BINARY_POSTAMBLE = {
          'linux'  => [
            0x89, 0xc3,             # mov ebx, eax (exit code)
            0xb8, 1, 0, 0, 0,       # mov eax, 1
            0xcd, 0x80              # int 0x80
          ],
          'darwin' => [
            0xc9,                   # leave
            0xc3                    # ret
          ]
        }

        # The shared arch object, built on first use.
        def self.instance
          return @instance if @instance
          settings = {
            'bits' => 32,
            'word_bits' => 16,
            'preamble' => BINARY_PREAMBLE,
            'postamble' => BINARY_POSTAMBLE
          }
          @instance = ASM::Arch.new(settings)
        end
      end
    end
  end
end

View file

@ -0,0 +1,866 @@
# A very basic x86 assembler library for Ruby. Generally the
# instructions implemented are the minimum needed by the compiler this
# is written for. x86 is just too big.
#
# sjs
# may 2009
#
# Refer to the Intel[1] or AMD documentation on x86 for explanations
# of Mod-R/M encoding, the Scale-Index-Base (SIB) byte, opcode groups.
#
# The start and exit shell codes were obtained by disassembling
# minimal binaries on the respective platforms.
require 'json'
require 'compiler/asm/binary_assembler'
require 'compiler/asm/x86/arch'
require 'compiler/asm/x86/registers'
class Compiler
module ASM
module X86
class BinaryAssembler < ASM::BinaryAssembler
include Registers
DEBUG_OUTPUT = false
SIGNED_BYTE = -128..127
# This is used for encoding instructions. Just as the equivalent
# assembly would contain "BITS 32", binary is generated for 32-bit
# protected mode.
DEFAULT_OPERAND_SIZE = :dword
SIZE_MAP = {
byte: 8,
word: 16,
dword: 32
}
# Always include the _main entry point in our symbol table. It begins at
# the beginning of the __TEXT segment, 0x0.
def emit_entry_point
  define_label('_main')
end

# Register that holds return values.
def return_reg
  EAX
end

### Virtual ISA used by parser.

# Load the value n into the return register.
def load(n)
  mov(return_reg, n)
end

# Load the named variable's contents into the return register.
def load_var(name)
  address = [var(name)]
  mov(return_reg, address)
end
# Store reg into the named variable. The name is resolved through var,
# exactly as load_var does, so the destination is a memory operand that
# mov can encode rather than a bare name.
def store_var(name, reg)
  mov([var(name)], reg)
end
# stack_* methods expect op1 on the stack
# stack_* methods expect op1 on the stack. It is popped into EBX and
# combined with reg.

def stack_add(reg)
  pop(EBX)
  add(reg, EBX)
end

def stack_sub(reg)
  pop(EBX)
  sub(reg, EBX)
end

# One-operand IMUL multiplies EBX with the implicit accumulator; reg is
# not passed to the instruction.
def stack_mul_signed(reg)
  pop(EBX)
  imul(EBX)
end

def stack_div(reg)
  pop(EBX)       # Get op1
  xchg(reg, EBX) # Swap the divisor and dividend into the correct places.
  # idiv uses edx:eax as the dividend so we need to ensure that edx
  # is correctly sign-extended w.r.t. eax.
  cdq            # Sign-extend eax into edx (Convert Double to Quad).
  idiv(EBX)      # Divide a (eax) by b (ebx).
end

# The bitwise instructions take (dest, src); both operands are passed.
def stack_or(reg)
  pop(EBX)
  or_(reg, EBX)
end

def stack_xor(reg)
  pop(EBX)
  xor(reg, EBX)
end

def stack_and(reg)
  pop(EBX)
  and_(reg, EBX)
end

def compare(reg, n)
  cmp(reg, n)
end

def mov_reg_imm(reg, imm)
  mov(reg, imm)
end
############################
### Instruction Encoding ###
############################
# Emit a 4-byte value, one byte at a time, as produced by num_to_quad.
def emit_dword(num)
  num_to_quad(num).each { |byte| emit_byte(byte) }
end

# Emit the ModR/M byte for addr, followed by whichever of the SIB byte,
# 8-bit displacement, 32-bit displacement or variable reference the
# addressing form needs.
#
# addr is either a register (register-direct, mod == 11) or an array that
# wraps an effective address, optionally preceded by a size prefix:
# [ESI], [:byte, ESI], [EBP + 8], [ESI + EBX], [0x1234], [var].
# reg is the value for the reg/opcode field of the ModR/M byte.
#
# Raises for operands that cannot be encoded.
def emit_modrm(addr, reg = 0)
  mod = 0
  rm = 0
  disp8 = nil
  disp32 = nil
  sib = nil
  var = nil # variable proxy

  # effective address
  if addr.is_a?(Array)
    eff_addr = addr[1] || addr[0] # works with or without size prefix
    raise "invalid effective address: #{addr.inspect}" unless eff_addr
    case eff_addr
    when RegisterProxy
      # Simple register addressing, e.g. [ESI].
      #
      # mod == 00
      if eff_addr.register?
        mod = 0
        # [ESP] and [EBP] can't be encoded directly. The workaround is to
        # use SIB to emit the code for [ESP+0] and [EBP+0] instead.
        #
        # To emit [ESP+0] we use SIB with scale=1 index=0 base=ESP.
        if eff_addr == ESP
          rm = 4 # SIB
          sib = make_sib(1, 0, eff_addr)
        # For [EBP+0] we can encode [EBP]+disp8 directly.
        elsif eff_addr == EBP
          mod = 1
          rm = eff_addr.regnum
          disp8 = 0
        else
          rm = eff_addr.regnum
        end

      # Base register plus displacement, e.g. [EBP + 8] or [EAX + 0x1234abcd]
      elsif eff_addr.index? && eff_addr.index.is_a?(Numeric)
        # disp8, mod == 01
        if SIGNED_BYTE === eff_addr.index
          mod = 1
          disp8 = eff_addr.index
        # disp32, mod == 10
        elsif SignedRange === eff_addr.index
          mod = 2
          disp32 = eff_addr.index
        else
          raise "address must fit in 32 bits, this doesn't: #{eff_addr.index}"
        end
        # The base register goes in the r/m field. rm == 100 means "SIB
        # follows", so an ESP base has to be expressed through a SIB byte
        # with no index.
        if eff_addr.base == ESP
          rm = 4
          sib = make_sib(1, 0, eff_addr)
        else
          rm = eff_addr.regnum
        end

      # SIB
      elsif eff_addr.index?
        # scale-index-base, mod == 00 and rm == 100
        rm = 4
        sib = make_sib(eff_addr.scale || 1, eff_addr.index, eff_addr.base)
      else
        raise "unsupported effective address: #{addr.inspect}"
      end

    # disp32, mod == 00
    when Numeric
      mod = 0
      rm = 5 # 101
      disp32 = eff_addr
    when VariableProxy
      mod = 0
      rm = 5
      var = eff_addr
    else
      raise "unsupported effective address: #{addr.inspect}"
    end

  # register content, mod == 11
  elsif addr.respond_to?(:register?) && addr.register?
    mod = 3
    rm = addr.regnum

  # XXX TODO elsif addr.respond_to?(:name)
  # (VariableProxy) => [:(var|const), addr.name]
  #
  # i.e. a pointer to that var
  else
    raise "unsupported effective address: #{addr.inspect}"
  end

  emit_byte((mod << 6) | (reg << 3) | rm)
  emit_byte(sib) if sib
  emit_byte(disp8) if disp8
  emit_dword(disp32) if disp32
  emit_var(var) if var
end
# Build a Scale-Index-Base byte: (scale bits << 6) | (index << 3) | base.
#
# scale must be 1, 2, 4 or 8 and is encoded as its position in that list.
# index and base may be register numbers or objects responding to regnum.
# An index of 0 means "no index" and is encoded as 4 (binary 100).
#
# Raises when scale is not one of the four supported values.
def make_sib(scale, index, base)
  scale_bits = [1, 2, 4, 8].index(scale)
  unless scale_bits
    raise "unsupported SIB scale: #{scale}, should be 1, 2, 4, or 8"
  end

  if index == 0
    index = 4
  elsif index.respond_to?(:regnum)
    index = index.regnum
  end
  base = base.regnum if base.respond_to?(:regnum)

  (scale_bits << 6) | (index << 3) | base
end
# True when op is a register proxy of the given operand size. size is
# either a key of SIZE_MAP (:byte, :word, :dword) or a width in bits.
#
# Only RegisterProxy instances qualify. Other objects that happen to
# respond to size (integers, strings, arrays) are not registers.
def register?(op, size = DEFAULT_OPERAND_SIZE)
  op.is_a?(RegisterProxy) && op.size == (SIZE_MAP[size] || size)
end
# True when op is a number that fits in the given operand size, read as
# either a signed or an unsigned value. size is a key of SIZE_MAP or a
# width in bits.
def immediate?(op, size = DEFAULT_OPERAND_SIZE)
  return false unless op.is_a?(Numeric)

  bits = SIZE_MAP[size] || size
  lowest = -(2 ** bits / 2)
  highest = 2 ** bits - 1
  op >= lowest && op <= highest
end
# Return true if op is a valid operand of the specified size.
# (:byte, :word, :dword)
#
# Valid operands are:
#
# * registers
#
# * effective addresses (wrapped in an array to look like nasm code)
#
# XXX This method is pretty ugly.
# True when op is a register of the given size or an effective address:
#
#   [memory]          -- a one-element array, accepted for any size
#   [<size>, memory]  -- a two-element array whose prefix equals size
#
# where memory is a number, a register proxy or a variable proxy
# (subclasses included).
def rm?(op, size = DEFAULT_OPERAND_SIZE)
  return true if register?(op, size)
  return false unless op.is_a?(Array)

  addressable = lambda do |operand|
    [Numeric, RegisterProxy, VariableProxy].any? { |klass| operand.is_a?(klass) }
  end

  case op.size
  when 1 then addressable.call(op[0])
  when 2 then op[0] == size && addressable.call(op[1])
  else false
  end
end
# True when addr is an array whose first element is a number or a
# variable proxy, i.e. a plain memory offset. size is accepted but unused.
def offset?(addr, size = DEFAULT_OPERAND_SIZE)
  return false unless addr.is_a?(Array)

  head = addr[0]
  head.is_a?(Numeric) || head.is_a?(VariableProxy)
end

# True for immediates and memory offsets.
def constant?(op)
  immediate?(op) || offset?(op)
end
# Base-2 logarithm of a positive, finite number, computed bit by bit.
#
# x   -- the argument, converted to a float so that halving it never
#        truncates
# tol -- the fractional part is refined until the step size drops
#        below this value
#
# Raises ArgumentError for zero, negative, non-finite or non-numeric
# input, for which the loops below would never terminate.
def log2(x, tol = 1e-13)
  unless x.is_a?(Numeric) && x > 0 && x.to_f.finite?
    raise ArgumentError, "log2 is only defined for positive finite numbers, got #{x.inspect}"
  end

  x = x.to_f
  result = 0.0

  # Integer part: scale x into [1, 2).
  while x < 1
    result -= 1
    x *= 2
  end
  while x >= 2
    result += 1
    x /= 2
  end

  # Fractional part: squaring x doubles its logarithm, so each squaring
  # exposes the next binary digit of the result.
  fp = 1.0
  while fp >= tol
    fp /= 2
    x *= x
    if x >= 2
      x /= 2
      result += fp
    end
  end

  result
end
# 9 versions of the mov instruction are supported:
# 1. mov reg32, immediate32
# 2a. mov reg32, r/m32
# 2b. mov eax, memoffset32
# 3a. mov r/m32, reg32
# 3b. mov memoffset32, eax
# 4. mov r/m32, immediate32
# 5. mov r/m8, imm8
# 6. mov reg8, r/m8
# 7. mov r/m8, reg8
def mov(dest, src)
# Encodes one MOV instruction. The operand pair is matched against the
# supported forms in the order written below, so the first matching form
# wins. Raises when no form matches.
#
# These 2 are used in the same way, just the name differs to make the
# meaning clear. They are 4-byte values that are emitted at the end if
# they are non-nil. Only one of them will be emitted, and if both are
# non-nil that one is immediate.
immediate = nil
offset = nil
# This is an array of arguments to be passed to emit_modrm, if it is set.
modrm = nil
# version 1: mov r32, imm32
if register?(dest) && immediate?(src)
opcode = 0xb8 + dest.regnum # dest encoded in instruction
immediate = src
# version 2a: mov r32, r/m32
elsif register?(dest) && rm?(src)
# version 2b: mov eax, moffs32
if dest == EAX && offset?(src)
opcode = 0xa1
offset = src[0]
else
opcode = 0x8b
modrm = [src, dest.regnum]
end
# version 3a: mov r/m32, r32
elsif rm?(dest) && register?(src)
# version 3b: mov moffs32, eax
if offset?(dest) && src == EAX
opcode = 0xa3
offset = dest[0]
else
opcode = 0x89
modrm = [dest, src.regnum]
end
# version 4: mov r/m32, imm32
elsif rm?(dest) && immediate?(src)
opcode = 0xc7
modrm = [dest, 0]
immediate = src
# version 5: mov r/m8, imm8
#
# It's important that this check comes before versions 6 and 7 because
# src integers can pass the register? check in version 7.
elsif rm?(dest, :byte) && immediate?(src, :byte)
opcode = 0xc6
modrm = [dest, 0]
# Single-byte immediate, emitted by the last branch of the block below.
immediate_byte = src
# version 6: mov r8, r/m8
elsif register?(dest, :byte) && rm?(src, :byte)
opcode = 0x8a
modrm = [src, dest.regnum]
# version 7: mov r/m8, r8
elsif rm?(dest, :byte) && register?(src, :byte)
opcode = 0x88
modrm = [dest, src.regnum]
else
# puts "rm?(dest): #{rm?(dest)}\t\trm?(src): #{rm?(src)}"
# puts "register?(dest): #{register?(dest)}\t\tregister?(src): #{register?(src)}"
# puts "immediate?(dest): #{immediate?(dest)}\t\timmediate?(src): #{immediate?(src)}"
# puts "offset?(dest): #{offset?(dest)}\t\toffset?(src): #{offset?(src)}"
# puts "rm?(dest, :byte): #{rm?(dest)}\t\trm?(src, :byte): #{rm?(src, :byte)}"
# puts "immediate?(dest, :byte): #{immediate?(dest)}\t\timmediate?(src, :byte): #{immediate?(src, :byte)}"
raise "unsupported MOV instruction, #{dest.inspect}, #{src.inspect}"
end
# The trailing 4-byte value, if any: the immediate takes precedence.
dword = immediate || offset
asm do
emit_byte(opcode)
emit_modrm(*modrm) if modrm
# A variable proxy is emitted through emit_const or emit_var instead of
# being written out as a literal dword.
if dword.is_a?(VariableProxy)
if dword.const?
emit_const(dword)
else
emit_var(dword)
end
elsif dword
emit_dword(dword)
elsif immediate_byte
emit_byte(immediate_byte)
end
end
end
# Move with zero-extension into a register: emits 0x0f, then 0xb6 for a
# byte source or 0xb7 for a word source, then the ModR/M bytes.
# Raises when dest is not a register or src is neither byte nor word sized.
def movzx(dest, src)
  unless register?(dest)
    raise "unimplemented MOVZX instruction, << dest=#{dest.inspect} >> src=#{src.inspect}"
  end

  opcode =
    if rm?(src, :byte)
      0xb6 # movzx Gv, Eb
    elsif rm?(src, :word)
      0xb7 # movzx Gv, Ew
    else
      raise "unsupported MOVZX instruction, dest=#{dest.inspect} << src=#{src.inspect} >>"
    end

  asm do
    emit_byte(0x0f)
    emit_byte(opcode)
    emit_modrm(src, dest.regnum)
  end
end
# Exchange two operands. Exchanges involving EAX and another register use
# the one-byte form 0x90 + regnum; everything else uses 0x87 with ModR/M.
def xchg(dest, src)
  return asm { emit_byte(0x90 + src.regnum) } if dest == EAX && register?(src)
  # Swap the args if EAX comes last so only one EAX case is needed.
  return xchg(src, dest) if src == EAX && register?(dest)

  rm_operand, reg_operand =
    if rm?(dest) && register?(src)
      [dest, src]
    elsif register?(dest) && rm?(src)
      [src, dest]
    else
      raise "unsupported XCHG instruction, dest=#{dest.inspect} src=#{src.inspect}"
    end

  asm do
    emit_byte(0x87)
    emit_modrm(rm_operand, reg_operand.regnum)
  end
end

# convert double to quad (sign-extend EAX into EDX)
def cdq
  asm { emit_byte(0x99) }
end
# Encode ADD dest, src. Supported forms, tried in this order:
#
#   add r/m32, imm8   (0x83 /0 ib) -- only for -128..127, because the
#                                     processor sign-extends the byte
#   add eax, imm32    (0x05 id)
#   add r/m32, imm32  (0x81 /0 id)
#   add reg32, r/m32  (0x03 /r)
#
# Raises for any other operand combination.
def add(dest, src)
  # add r/m32, imm8
  if rm?(dest) && immediate?(src, :byte) && SIGNED_BYTE === src
    asm do
      emit_byte(0x83)
      emit_modrm(dest, 0)
      emit_byte(src)
    end

  # add eax, imm32 -- checked before the generic form, which also matches
  # EAX and would otherwise make this shorter encoding unreachable.
  elsif dest == EAX && immediate?(src)
    asm do
      emit_byte(0x05)
      emit_dword(src)
    end

  # add r/m32, imm32
  elsif rm?(dest) && immediate?(src)
    asm do
      emit_byte(0x81)
      emit_modrm(dest, 0)
      emit_dword(src)
    end

  # add reg32, r/m32
  elsif register?(dest) && rm?(src)
    asm do
      emit_byte(0x03)
      emit_modrm(src, dest.regnum)
    end

  else
    raise "unsupported ADD instruction, dest=#{dest.inspect} src=#{src.inspect}"
  end
end
# Encode SUB dest, src. Supported forms, tried in this order:
#
#   sub r/m32, imm8   (0x83 /5 ib) -- only for -128..127, because the
#                                     processor sign-extends the byte
#   sub r/m32, imm32  (0x81 /5 id)
#   sub r/m32, reg32  (0x29 /r)
#   sub reg32, r/m32  (0x2b /r)
#
# Raises for any other operand combination.
def sub(dest, src)
  # sub r/m32, imm8
  if rm?(dest) && immediate?(src, :byte) && SIGNED_BYTE === src
    asm do
      emit_byte(0x83)
      emit_modrm(dest, 5)
      emit_byte(src)
    end

  # sub r/m32, imm32
  elsif rm?(dest) && immediate?(src)
    asm do
      emit_byte(0x81)
      emit_modrm(dest, 5)
      emit_dword(src)
    end

  # sub r/m32, reg32
  elsif rm?(dest) && register?(src)
    asm do
      emit_byte(0x29)
      emit_modrm(dest, src.regnum)
    end

  # sub reg32, r/m32
  elsif register?(dest) && rm?(src)
    asm do
      emit_byte(0x2b)
      emit_modrm(src, dest.regnum)
    end

  else
    raise "unsupported SUB instruction, dest=#{dest.inspect} src=#{src.inspect}"
  end
end
# Signed multiply.
# Only the one-operand form is encoded (group 3, /5). The two-operand
# form is recognised but rejected; any other arity is an ArgumentError.
def imul(*ops)
  if ops.size == 1
    group3(ops[0], 5, 'IMUL')
  elsif ops.size == 2
    dest, src = ops
    raise "unsupported IMUL instruction, dest=#{dest.inspect} src=#{src.inspect}"
  else
    raise ArgumentError, "IMUL accepts exactly 1 or 2 operands (got #{ops.inspect})"
  end
end

# Unsigned multiply (group 3, /4).
def mul(op)
  group3(op, 4, 'MUL')
end

# Signed divide (group 3, /7).
def idiv(op)
  group3(op, 7, 'IDIV')
end

# Unsigned divide (group 3, /6).
def div(op)
  group3(op, 6, 'DIV')
end
# Increment a 32-bit register (0x40 + regnum). Memory operands are
# recognised but not implemented yet; anything else is rejected.
def inc(op)
  if register?(op)
    # inc reg32 -- the register number comes from the operand itself,
    # as in dec below.
    asm { emit_byte(0x40 + op.regnum) }
  elsif rm?(op)
    # emit_byte(0xff)
    raise "unimplemented"
  else
    raise "unsupported op #{op}, wanted r32 or r/m32"
  end
end

# Decrement a 32-bit register (0x48 + regnum).
def dec(op)
  if register?(op)
    # dec reg32
    asm { emit_byte(0x48 + op.regnum) }
  else
    raise "unsupported DEC instruction, op=#{op.inspect}"
  end
end
# Shift right by an immediate count: opcode 0xc0 for byte registers and
# 0xc1 otherwise, with /5 in the ModR/M byte. The count must lie within
# SIGNED_BYTE.
def shr(op, n)
  # shr r/m??, imm8
  unless SIGNED_BYTE === n
    raise "unsupported SHR instruction, op=#{op.inspect}, n=#{n.inspect}"
  end

  opcode = register?(op, :byte) ? 0xc0 : 0xc1
  asm do
    emit_byte(opcode)
    emit_modrm(op, 5)
    emit_byte(n)
  end
end
# Bitwise AND. Supported forms:
#
#   and r/m32, reg32  (0x21 /r)
#   and r/m8, imm8    (0x80 /4 ib)
#
# Operand sizes are named with the :byte key, as everywhere else in this
# class, so byte registers are recognised by the second form.
def and_(dest, src)
  if rm?(dest) && register?(src)
    asm do
      emit_byte(0x21)
      emit_modrm(dest, src.regnum)
    end
  elsif rm?(dest, :byte) && immediate?(src, :byte)
    asm do
      emit_byte(0x80)
      emit_modrm(dest, 4)
      emit_byte(src)
    end
  else
    raise "unsupported AND instruction: dest=#{dest.inspect}, src=#{src.inspect}"
  end
end
alias_method :and, :and_

# Bitwise OR. Supported forms:
#
#   or r/m32, reg32  (0x09 /r)
#   or r/m8, imm8    (0x80 /1 ib)
def or_(dest, src)
  if rm?(dest) && register?(src)
    asm do
      emit_byte(0x9)
      emit_modrm(dest, src.regnum)
    end
  elsif rm?(dest, :byte) && immediate?(src, :byte)
    asm do
      emit_byte(0x80)
      emit_modrm(dest, 1)
      emit_byte(src)
    end
  else
    raise "unsupported OR instruction: dest=#{dest.inspect}, src=#{src.inspect}"
  end
end
alias_method :or, :or_
# Bitwise XOR; only xor r/m32, reg32 (0x31 /r) is supported.
def xor(dest, src)
  unless rm?(dest) && register?(src)
    raise "unsupported XOR instruction, dest=#{dest.inspect} src=#{src.inspect}"
  end

  asm do
    emit_byte(0x31)
    emit_modrm(dest, src.regnum)
  end
end

# Bitwise NOT (group 3, /2).
def not_(op)
  group3(op, 2, 'NOT')
end
alias_method :not, :not_

# Negate (group 3, /3).
def neg(op)
  group3(op, 3, 'NEG')
end
# Push a 32-bit register or an immediate. Supported forms:
#
#   push reg32  (0x50 + regnum)
#   push imm8   (0x6a ib) -- only for -128..127, because the processor
#                            sign-extends the byte
#   push imm32  (0x68 id)
def push(op)
  # push reg32
  if register?(op)
    asm { emit_byte(0x50 + op.regnum) }
  elsif immediate?(op, :byte) && SIGNED_BYTE === op
    asm do
      emit_byte(0x6a)
      emit_byte(op)
    end
  elsif immediate?(op)
    asm do
      emit_byte(0x68)
      emit_dword(op)
    end
  else
    raise "unsupported PUSH instruction: op=#{op.inspect}"
  end
end

# Pop into a 32-bit register (0x58 + regnum).
def pop(op)
  # pop reg32
  if register?(op)
    asm { emit_byte(0x58 + op.regnum) }
  else
    raise "unsupported POP instruction: op=#{op.inspect}"
  end
end
# Compare two operands. Supported forms:
#
#   cmp r/m32, reg32  (0x39 /r)
#   cmp eax, imm32    (0x3d id)
def cmp(op1, op2)
  if rm?(op1) && register?(op2)
    asm do
      emit_byte(0x39)
      emit_modrm(op1, op2.regnum)
    end
  elsif op1 == EAX && immediate?(op2)
    asm do
      emit_byte(0x3d)
      emit_dword(op2)
    end
  else
    raise "unsupported CMP instruction: op1=#{op1.inspect} op2=#{op2.inspect}"
  end
end

# Unconditional jump. Only jmp rel32 (0xe9) is supported.
def jmp(label)
  asm do
    emit_byte(0xe9)
    emit_label(label)
  end
end
# These all jump near (rel32).
# Second opcode byte for each supported conditional jump. Looking up an
# unknown mnemonic raises, naming the mnemonic that was asked for. (The
# default block receives the hash first and the missing key second.)
JccOpcodeMap = Hash.new { |_hash, key| raise "unsupported Jcc instruction: #{key}" }.
  update({
    :jc => 0x82,  # carry (CF=1)
    :je => 0x84,  # equal (ZF=1) --- same as jz
    :jg => 0x8f,  # greater (ZF=0 and SF=OF)
    :jl => 0x8c,  # less than (SF!=OF)
    :jne => 0x85, # not equal (ZF=0) --- same as jnz
    :jng => 0x8e, # not greater than (ZF=1 or SF!=OF)
    :jnl => 0x8d, # not less than (SF=OF)
    :jnz => 0x85, # not zero (ZF=0)
    :jo => 0x80,  # overflow (OF=1)
    :js => 0x88,  # sign (SF=1)
    :jz => 0x84   # zero (ZF=1)
  })
# Only Jcc rel32 is supported.
# Conditional near jump: emits 0x0f, the opcode looked up for the
# mnemonic, then the label reference. Only Jcc rel32 is supported.
def jcc(instruction, label)
  opcode = JccOpcodeMap[instruction]
  asm do
    emit_byte(0x0f)
    emit_byte(opcode)
    emit_label(label)
  end
end

# Define one method per mnemonic in the opcode map (je, jne, jz, ...),
# each taking the target label.
JccOpcodeMap.each_key do |mnemonic|
  define_method(mnemonic) { |label| jcc(mnemonic, label) }
end
# lea r32, mem (0x8d /r)
def lea(r32, mem)
  asm do
    emit_byte(0x8d)
    emit_modrm(mem, r32.regnum)
  end
end

# int n (0xcd ib)
def int(n)
  asm do
    emit_byte(0xcd)
    emit_byte(n)
  end
end

# ret (0xc3)
def ret
  asm do
    emit_byte(0xc3)
  end
end

# leave (0xc9)
def leave
  asm do
    emit_byte(0xc9)
  end
end
# NOTE: LOOP only accepts a 1-byte signed offset. Don't use it.
# loop rel8 (0xe2 ib). The distance is measured from the end of this
# two-byte instruction to the label's recorded location and must lie
# within SIGNED_BYTE, otherwise an error is raised.
def loop_(label)
  next_ip = ip + 2 # loop instruction is 2 bytes
  delta = @symtab.lookup_label(label) - next_ip
  raise "LOOP can only jump -128 to 127 bytes, #{label} is #{delta} bytes away" unless SIGNED_BYTE === delta

  asm do
    emit_byte(0xe2)
    emit_byte(delta)
  end
end
alias_method :loop, :loop_
# Opcode group #3. 1-byte opcode, 1 operand (r/m8 or r/m32).
#
# Members of this group are: DIV, IDIV, MUL, IMUL, NEG, NOT, and TEST.
# Encode a member of opcode group 3: opcode 0xf7 for r/m32 operands or
# 0xf6 for r/m8 operands, followed by ModR/M with reg in the opcode
# extension field. instruction is only used in the error message.
#
# The dword form is tried first so that an address without a size prefix,
# such as [ESI], gets the default operand size. Byte registers and
# [:byte, mem] operands select the byte form.
def group3(op, reg, instruction)
  opcode =
    if rm?(op)
      0xf7
    elsif rm?(op, :byte)
      0xf6
    else
      raise "unsupported #{instruction} instruction: op=#{op.inspect}"
    end

  asm do
    emit_byte(opcode)
    emit_modrm(op, reg)
  end
end
end
end
end
end

View file

@ -0,0 +1,32 @@
require 'asm/regproxy'
# NOTE(review): this module is defined as ::ASM::Registers, while the x86
# binary assembler does `include Registers` from inside
# Compiler::ASM::X86 and RegisterProxy lives in Compiler::ASM. Confirm the
# intended namespace before relying on this file.
module ASM
  module Registers
    # x86 registers of all sizes. A row's position is the register number.
    # Within a row the columns are the 8-, 16- and 32-bit registers, i.e.
    # the size in bytes is 2 ** column.
    Registers = [
      [:al, :ax, :eax], # 0
      [:cl, :cx, :ecx], # 1
      [:dl, :dx, :edx], # 2
      [:bl, :bx, :ebx], # 3
      [:ah, :sp, :esp], # 4
      [:ch, :bp, :ebp], # 5
      [:dh, :si, :esi], # 6
      [:bh, :di, :edi]  # 7
    ]

    # Define one constant per register (AL, AX, EAX, ...) holding a
    # register proxy. Proxies are used both in effective address
    # calculations and as plain symbols representing registers.
    Registers.each_with_index do |row, regnum|
      row.each_with_index do |reg, column|
        bits = 8 << column # 8, 16, 32
        const_set(reg.to_s.upcase, RegisterProxy.new(reg, bits, regnum))
      end
    end
  end
end

View file

@ -0,0 +1,11 @@
BITS 32
GLOBAL _main
SECTION .data
{data}
SECTION .bss
{bss}
SECTION .text
_main:
{code}
;; The result in eax is the exit code, just return.
ret

View file

@ -0,0 +1,13 @@
BITS 32
GLOBAL _start
SECTION .data
{data}
SECTION .bss
{bss}
SECTION .text
_start:
{code}
;; The result in eax is the exit code, move it to ebx.
mov ebx, eax
mov eax, 1 ; _exit syscall
int 0x80 ; call Linux

View file

@ -0,0 +1,159 @@
# A subset of x86 assembly.
#
# sjs
# may 2009
require 'compiler/asm/text_assembler'
class Compiler
  module ASM
    module X86
      # ASM methods output nasm-friendly x86 asm code, line by
      # line. This is dead easy and we can trust nasm to compile
      # correct machine code, which isn't trivial.
      class TextAssembler < ASM::TextAssembler
        # Nothing to emit here.
        def emit_entry_point
        end

        # Define a constant in the .data section, one definition per line.
        def const(name, value)
          @data << "#{name}\tequ #{value}\n"
        end

        # Define a variable with the given name and size in bytes. Storage
        # is reserved in whole dwords, rounding up so that sizes that are
        # not a multiple of 4 still get enough room.
        def define_var_impl(name, bytes = arch.bytes)
          super(name, bytes)
          dwords = (bytes + 3) / 4
          @bss << "#{name}: resd #{dwords}\n"
        end

        # Fill the template's {data}, {bss} and {code} placeholders. The
        # block form of sub inserts the text verbatim; with a string
        # replacement, backslash sequences in it would be interpreted.
        def output
          File.read(template_filename).
            sub("{data}") { @data }.
            sub("{bss}") { @bss }.
            sub("{code}") { @code }
        end

        def emit_label(name = label)
          emit("#{name}:", tab: nil)
        end

        # Numeric sources get a trailing comment with the hex value.
        def mov(dest, src)
          emit("mov #{dest}, #{src}#{src.is_a?(Numeric) ? " ; 0x#{src.to_s(16)}" : ''}")
        end

        def movzx(dest, src)
          emit("movzx #{dest}, #{src}")
        end

        def add(dest, src)
          emit("add #{dest}, #{src}")
        end

        def sub(dest, src)
          emit("sub #{dest}, #{src}")
        end

        def imul(op)
          emit("imul #{op}")
        end

        def idiv(op)
          emit("idiv #{op}")
        end

        def inc(op)
          emit("inc #{op}")
        end

        def dec(op)
          emit("dec #{op}")
        end

        def push(reg)
          emit("push #{reg}")
        end

        def pop(reg)
          emit("pop #{reg}")
        end

        def call(label)
          emit("call #{label}")
        end

        def leave
          emit("leave")
        end

        def neg(reg)
          emit("neg #{reg}")
        end

        # The binary assembler answers to both the plain and the
        # underscore-suffixed names of and/or/not/loop, so the same
        # aliases are provided here.
        def not(rm32)
          emit("not #{rm32}")
        end
        alias_method :not_, :not

        def xchg(op1, op2)
          emit("xchg #{op1}, #{op2}")
        end

        def and_(op1, op2)
          emit("and #{op1}, #{op2}")
        end
        alias_method :and, :and_

        def or(op1, op2)
          emit("or #{op1}, #{op2}")
        end
        alias_method :or_, :or

        def xor(op1, op2)
          emit("xor #{op1}, #{op2}")
        end

        def jz(label)
          emit("jz #{label}")
        end

        def jnz(label)
          emit("jnz #{label}")
        end

        def jmp(label)
          emit("jmp #{label}")
        end

        def jl(label)
          emit("jl #{label}")
        end

        def cmp(a, b)
          emit("cmp #{a}, #{b}")
        end

        def lea(a, b)
          emit("lea #{a}, #{b}")
        end

        def shr(a, b)
          emit("shr #{a}, #{b}")
        end

        def loop_(label)
          emit("loop #{label}")
        end
        alias_method :loop, :loop_

        def int(num)
          emit("int 0x#{num.to_s(16)}")
        end

        def cdq
          emit("cdq")
        end
      end
    end
  end
end

108
lib/compiler/build.rb Executable file
View file

@ -0,0 +1,108 @@
#!/usr/bin/env ruby
require 'compiler'
# usage: build.rb <filename> [output filename] [elf | macho] [asm | bin]
# Binary format per platform name; unknown platforms fall back to 'bin'.
BIN_FORMATS = Hash.new('bin').update(
  'darwin' => 'macho',
  'linux' => 'elf'
)
# Command-line entry point: reads the source filename, an optional output
# directory and an optional binary format from ARGV, builds the program and
# prints the name of the resulting file before exiting.
#
# Raises RuntimeError when the source file is not readable.
def main
  filename = ARGV.shift.to_s
  raise "can't read #{filename}" unless File.readable?(filename)

  outdir = ARGV.shift || '.'
  platform = `uname -s`.chomp.downcase
  # The filename and output directory have already been shifted off ARGV, so
  # the optional binary format is now the first remaining argument.
  binformat = ARGV[0] ? ARGV[0].downcase : BIN_FORMATS[platform]
  puts "Building #{filename} for #{platform}, binformat is #{binformat} ..."
  outfile = build(filename, outdir, platform, binformat)
  puts outfile
  exit
end
# Write a message to standard error.
def error(msg)
  STDERR.puts(msg)
end
# name part (filename minus extension)
# Strip the extension from the last path component only. Excluding '/' from
# the extension keeps a dot inside a directory name (e.g. "../dir/file" or
# "./prog") from being mistaken for the start of an extension.
def base(filename)
  filename.sub(/\.[^.\/]*$/, '')
end
# infile: input filename
# outfile: output filename
# asm: assembler to use
# Compile +infile+ and write the result to +outfile+ (opened in binary mode).
# On a parse error the message, context and recorded call stack are written
# to standard error and the process exits with status 1.
#
# NOTE(review): this calls Compiler.compile(input, asm) as a class method;
# confirm that it exists, since the Compiler class visible elsewhere only
# defines an instance-level #compile(input).
def compile(infile, outfile, asm)
  File.open(infile, 'r') do |input|
    File.open(outfile, 'wb') do |out|
      out.print(Compiler.compile(input, asm))
    end
  end
# ParseError is declared inside Compiler, so it has to be referenced with its
# namespace from this top-level method.
rescue Compiler::ParseError => e
  error("[error] #{e.message}")
  error("[context] #{e.context}")
  error(e.caller)
  exit(1)
end
# Run a shell command, capturing its standard output. When the command does
# not exit with status 0, print the captured output and raise a RuntimeError
# naming the command's first word and its exit status.
def run_and_warn_on_failure(command)
  output = `#{command}`
  status = $?
  return if status.exitstatus == 0

  puts
  print output
  name = command.split.first
  raise "#{name} failed: #{status.exitstatus}"
end
# link with ld, return resulting filename.
# Link the object file into an executable named after it (minus extension),
# make it executable for the user and return its name. The +outdir+ argument
# is accepted but not used here.
#
# Raises RuntimeError for platforms other than 'darwin' and 'linux'.
def link(filename, outdir, platform = 'linux')
  f = base(filename)
  linkers = {
    'darwin' => ['gcc', '-arch i386'],
    'linux'  => ['ld', '']
  }
  cmd, args = linkers.fetch(platform) { raise "unsupported platform: #{platform}" }
  run_and_warn_on_failure("#{cmd} #{args} -o #{f} #{filename} 2>&1")
  `chmod u+x #{f}`
  f
end
# Compile +filename+ straight to an object file in +outdir+ using the binary
# assembler, link it and return the executable's name.
#
# Raises RuntimeError when +binformat+ is neither 'elf' nor 'macho'.
#
# NOTE(review): the constants used here (Compiler::ELF, Compiler::MachO,
# Compiler::ASM::Binary) are not defined anywhere in view -- confirm they
# still exist after the move into lib/.
def build(filename, outdir, platform = 'linux', binformat = 'macho')
  objfile = File.join(outdir, base(filename) + '.o')
  format_module =
    case binformat
    when 'elf' then Compiler::ELF
    when 'macho' then Compiler::MachO
    else raise "unsupported binary format: #{binformat}"
    end
  symtab = format_module::SymbolTable.new
  objwriter_class = format_module::ObjectFile
  compile(filename, objfile, Compiler::ASM::Binary.new(platform, symtab, objwriter_class))
  link(objfile, outdir, platform)
end
# Compile +filename+ to assembly text in +outdir+, assemble it with nasm,
# link the object file and return the executable's name.
#
# NOTE(review): Compiler::ASM::Text is not defined anywhere in view --
# confirm it still exists after the move into lib/.
def build_asm(filename, outdir, platform = 'linux', binformat = 'macho')
  asmfile = File.join(outdir, base(filename) + '.asm')
  compile(filename, asmfile, Compiler::ASM::Text.new(platform))
  objfile = assemble(asmfile, binformat)
  # link takes (filename, outdir, platform); passing only two arguments would
  # put the platform in the outdir slot and always link for the default.
  link(objfile, outdir, platform)
end
# assemble using nasm, return resulting filename.
# Runs nasm on +filename+ with debug info (-g) for the given object format and
# returns the object file name, which is the input name with its extension
# replaced by ".o".
def assemble(filename, binformat = 'macho')
  outfile = base(filename) + '.o'
  run_and_warn_on_failure("nasm -f #{binformat} -g -o #{outfile} #{filename} 2>&1")
  outfile
end
# Run the build only when this file is executed directly, not when required.
main if $0 == __FILE__

View file

@ -0,0 +1,14 @@
class Compiler
  # Error raised for invalid input. Besides the usual message it carries the
  # call stack recorded by whoever raised it (+caller+) and, optionally, a
  # snippet of the input near the failure (+context+).
  class ParseError < RuntimeError
    attr_reader :caller, :context

    def initialize(caller, context = nil)
      @caller, @context = caller, context
    end
  end
end

966
lib/compiler/parser.rb Normal file
View file

@ -0,0 +1,966 @@
# A compiler as described by Jack Crenshaw in his famous book "Let's
# Build a Compiler". At least in the beginning, this code will
# closely reflect the Pascal code written by Jack. Over time it may
# become more idiomatic, however this is an academic exercise.
#
# sjs
# may 2009
require 'compiler/parse_error'
class Compiler
class Parser
# Maps each reserved word to the name of the parser method that handles it
# (dispatched through #keyword). 'else' and 'end' are reserved but map to nil:
# they only terminate a block (see #block) and have no handler of their own.
KEYWORDS = {
'if' => :if_else_stmt,
'while' => :while_stmt,
'until' => :until_stmt,
'repeat' => :repeat_stmt,
'for' => :for_stmt,
'do' => :do_stmt,
'break' => :break_stmt,
'print' => :print_stmt,
'else' => nil,
'end' => nil
}
# Operators, grouped by precedence.
OPS = {
  :add   => ['+', '-'],
  :mul   => ['*', '/'],
  :rel   => ['==', '!=', '<', '>', '<=', '>='],
  :or    => ['||'],
  :and   => ['&&'],
  :bit   => ['|', '^', '&'],
  :unary => ['-', '+']
}
# Op chars are the characters that can begin an operator: for every kind of
# operator, the first character of each operator of that kind.
OP_CHARS = {}
OPS.each { |kind, ops| OP_CHARS[kind] = ops.map { |op| op[0, 1] } }
# :all is a very general test: any character that starts any operator.
OP_CHARS[:all] = OPS.values.flatten.map { |op| op[0, 1] }.sort.uniq
# Integer encodings of booleans in the generated code: false is 0 and true is
# -1, so that a one's complement NOT of one gives the other (see #not_factor).
FALSE = 0
TRUE = -1
attr_reader :asm
# input: stream to read source text from
# asm:   assembler that receives the generated code
def initialize(input, asm)
  @input = input                  # Stream to read from.
  @asm = asm                      # assembler
  @keywords = KEYWORDS.clone
  @keyword_names = @keywords.keys
  @label_stack = []
  @indent = 0                     # for pretty printing
  @token = @value = nil           # Type and value of last read token.
  @look = ''                      # Next lookahead char.
  # seed the lexer
  get_char
end
# Parse the whole input as one block of code; anything left over after that
# block is reported as an error.
def parse
  block
  return if eof?

  expected(:'end of file')
end
# Return whatever the assembler's #output produces for the code emitted so far.
def compile
asm.output
end
# Scan the input stream for the next token.
def scan
  return if @look.nil? # eof

  return get_name if alpha?(@look)
  return get_number if digit?(@look)
  return get_op if op_char?(@look)

  # Newlines and comments produce no token: skip them and try again.
  if newline?(@look)
    skip_any_whitespace
    return scan
  end
  if comment_char?(@look)
    skip_comment
    return scan
  end

  # XXX default to single char op... should probably raise.
  @token = :op
  @value = @look
  get_char
end
# put back the most recently parsed value
def backtrack
# Push the lookahead char back first, then the last token's bytes in reverse,
# so that they are read again in their original order.
# NOTE(review): whitespace skipped after the token (see #many) is not
# restored -- confirm that no caller depends on it.
@input.ungetc(@look[0])
@value.reverse.each_byte {|i| @input.ungetc(i)}
# Re-seed the lookahead from the restored input.
get_char
end
# Parse and translate an identifier or function call.
def identifier
  name = get_name
  # Without an opening paren this is a plain variable access.
  return asm.load_var(name) unless @look == '('

  # function call
  match('(')
  # TODO arg list
  match(')')
  asm.call(name)
end
# Parse and translate a single factor. Result is in eax.
def factor
  # Parenthesized sub-expression.
  if @look == '('
    match('(')
    boolean_expression
    return match(')')
  end

  return identifier if alpha?(@look) # or call
  return asm.load(get_number.to_i) if digit?(@look)

  expected(:'integer, identifier, function call, or parenthesized expression', :got => @look)
end
# Parse a signed factor.
def signed_factor
  # Remember the sign before it is consumed; only '-' needs any code.
  negate = (@look == '-')
  match(@look) if op?(:unary, @look)
  factor
  asm.neg(return_reg) if negate
end
# Parse and translate a single term (factor or mulop). Result is in
# eax.
def term
  signed_factor # Result in eax.
  while op?(:mul, @look)
    # Save the left operand; the operator method combines it with the right.
    asm.push(return_reg)
    if @look == '*'
      multiply
    elsif @look == '/'
      divide
    end
  end
end
# Parse and translate a general expression of terms. Result is
# in eax.
def arithmetic_expression
  term # Result is in eax.
  while op_char?(@look, :add)
    # Save the left operand; the operator method combines it with the right.
    asm.push(return_reg)
    if @look == '+'
      add
    elsif @look == '-'
      subtract
    end
  end
end
# Parse an addition operator and the 2nd term (b). The result is
# left in eax. The 1st term (a) is expected on the stack.
def add
# Consume the operator, then compile the right-hand term.
match('+')
term # Result is in eax.
asm.stack_add(return_reg) # Add a to b.
end
# Parse a subtraction operator and the 2nd term (b). The result is
# left in eax. The 1st term (a) is expected on the stack.
def subtract
# Consume the operator, then compile the right-hand term.
match('-')
term # Result, b, is in eax.
asm.neg(return_reg) # Fake the subtraction. a - b == a + -b
asm.stack_add(return_reg) # Add a to -b.
end
# Parse a multiplication operator and the 2nd factor (b). The result is
# left in eax. The 1st factor (a) is expected on the stack.
def multiply
# Consume the operator, then compile the right-hand factor.
match('*')
signed_factor # Result, b, is in return_reg.
asm.stack_mul_signed(return_reg) # Multiply a by b.
end
# Parse a division operator and the divisor (b). The result is
# left in eax. The dividend (a) is expected on the stack.
def divide
# Consume the operator, then compile the divisor.
match('/')
signed_factor # Result is in eax.
asm.stack_div(return_reg) # Divide a by b.
end
###################
# bit expressions #
###################
# Parse an arithmetic expression followed by any number of bitwise operations
# ('|', '^', '&'). An operator token that merely starts with one of those
# characters (such as '||') is pushed back and ends the expression.
def bit_expression
  handlers = {
    '|' => :bitor_expression,
    '^' => :bitxor_expression,
    '&' => :bitand_expression
  }
  arithmetic_expression
  while op?(:bit, @look)
    scan
    handler = handlers[@value]
    unless handler
      backtrack
      return
    end
    send(handler)
  end
end
# Compile the right-hand side of a bitwise operation and combine it with the
# left-hand side via the assembler's stack_<op> method. The operator itself
# has already been consumed by the caller; +token+ is not used here.
def bit_op(op, token)
asm.push(return_reg)
arithmetic_expression
asm.send("stack_#{op}", return_reg)
end
# Bitwise OR: delegates to #bit_op with the :or operation.
def bitor_expression
bit_op(:or, '|')
end
# Bitwise XOR: delegates to #bit_op with the :xor operation.
def bitxor_expression
bit_op(:xor, '^')
end
# Bitwise AND: delegates to #bit_op with the :and operation.
def bitand_expression
bit_op(:and, '&')
end
#######################
# boolean expressions #
#######################
# Parse boolean terms joined by '||'. The emitted code skips the right-hand
# term when the left-hand one is already truthy, and leaves TRUE or FALSE in
# the return register.
def boolean_expression
boolean_term
while @look == '|'
scan
expected('||') unless match_word('||')
false_label = asm.make_label(:false)
truthy_label = asm.make_label(:truthy)
done_label = asm.make_label(:done)
# Left operand truthy: jump straight to the TRUE result.
asm.compare(return_reg, FALSE)
asm.jne(truthy_label)
boolean_term
# Right operand false as well: the whole expression is false.
asm.compare(return_reg, FALSE)
asm.je(false_label)
asm.define_label(truthy_label)
asm.mov_reg_imm(return_reg, TRUE)
asm.jmp(done_label)
asm.define_label(false_label)
asm.mov_reg_imm(return_reg, FALSE)
asm.define_label(done_label)
end
end
# Parse (possibly negated) boolean factors joined by '&&'. The emitted code
# skips the right-hand factor when the left-hand one is already false, and
# leaves TRUE or FALSE in the return register.
def boolean_term
  not_factor
  while @look == '&'
    scan
    expected('&&') unless match_word('&&')
    false_label = asm.make_label(:false)
    done_label = asm.make_label(:done)
    # Left operand false: the whole expression is false.
    asm.compare(return_reg, FALSE)
    asm.je(false_label)
    not_factor
    # Right operand false: same outcome.
    asm.compare(return_reg, FALSE)
    asm.je(false_label)
    asm.mov_reg_imm(return_reg, TRUE)
    asm.jmp(done_label)
    asm.define_label(false_label)
    # The false branch must load FALSE, otherwise '&&' could never be false.
    asm.mov_reg_imm(return_reg, FALSE)
    asm.define_label(done_label)
  end
end
# Parse a boolean literal or, failing that, a relation.
def boolean_factor
  return relation unless boolean?(@look)

  if get_boolean == 'true'
    asm.mov_reg_imm(return_reg, TRUE)
  else
    # xor of a register with itself leaves zero in it.
    asm.xor(return_reg, return_reg)
  end
  scan
end
# Parse a boolean factor with an optional leading '!'.
def not_factor
  return boolean_factor unless @look == '!'

  match('!')
  boolean_factor
  make_boolean(return_reg) # ensure it is -1 or 0...
  asm.not_(return_reg)     # so that 1's complement NOT is also boolean not
end
# Convert any identifier to a boolean (-1 or 0). This is
# semantically equivalent to !!reg in C or Ruby.
def make_boolean(reg)
# reg: the register whose value is normalized in place.
end_label = asm.make_label(:endmakebool)
asm.compare(reg, FALSE) # if false do nothing
asm.jz(end_label)
asm.mov_reg_imm(reg, TRUE) # truthy, make it true
asm.define_label(end_label)
end
# Parse a bit expression optionally followed by one relational operator and a
# second operand. The left operand is pushed before the relation is compiled.
def relation
  bit_expression
  return unless op_char?(@look, :rel)

  scan
  asm.push(return_reg)
  handler = {
    '==' => :eq_relation,
    '!=' => :neq_relation,
    '>'  => :gt_relation,
    '>=' => :ge_relation,
    '<'  => :lt_relation,
    '<=' => :le_relation
  }[@value]
  send(handler) if handler
end
# a: <on the stack>
# b: eax
#
# If b - a is zero then a = b, and make_boolean will leave the zero
# to effectively return false. If b - a is non-zero then a != b,
# and make_boolean will leave -1 (true) for us in eax.
def neq_relation
  bit_expression
  asm.stack_sub(return_reg)
  # make_boolean needs the register to normalize: the difference left in the
  # return register becomes -1 (true) when non-zero and stays 0 otherwise.
  make_boolean(return_reg)
end
# Invert the != test for equal.
def eq_relation
# neq_relation leaves -1 or 0 in the return register; flipping every bit
# turns that into the answer for ==.
neq_relation
# NOTE(review): other call sites in this class use asm.not_ -- confirm which
# spelling the assembler actually provides.
asm.not(return_reg)
end
# > and < are both implemented in terms of jl (jump if less than).
# We exploit the fact that cmp is the subtraction of src from dest
# and order the terms appropriately for each function. As for >=
# and <=, they in turn are implemented in terms of > and <. a is
# greater than or equal to b if and only if a is *not* less than b.
#
# Note: This was done to minimize the number of instructions that
# the assembler needed to implement, but since the Jcc
# instructions are very cheap to implement this is no longer
# a concern.
# The next 4 relations all compare 2 values a and b, then return
# true (-1) if the difference was below zero and false (0)
# otherwise (using JL, jump if less than).
# a, b:    operands handed to asm.compare, in that order
# options: :invert => true swaps the true and false results
def cmp_relation(a, b, options = {})
bit_expression
# Fetch the left operand that the caller pushed before this relation.
asm.pop(EBX)
# Invert the sense of the test?
invert = options[:invert]
true_label = asm.make_label(:cmp)
end_label = asm.make_label(:endcmp)
asm.compare(a, b)
asm.jl(true_label)
asm.mov(EAX, FALSE) # return false
asm.not_(EAX) if invert # (or true if inverted)
asm.jmp(end_label)
asm.define_label(true_label)
asm.mov(EAX, FALSE) # start from false and flip it below: return true
asm.not_(EAX) unless invert # (or false if inverted)
asm.define_label(end_label)
end
# a: <on the stack>
# b: eax
#
# if a > b then b - a < 0
def gt_relation
  # TODO: fix this
  cmp_relation(EAX, EBX) # b - a
end
# a: <on the stack>
# b: eax
#
# if a < b then a - b < 0
def lt_relation
# EBX holds a (popped by cmp_relation) and EAX holds b.
cmp_relation(EBX, EAX) # a - b
end
# a: <on the stack>
# b: eax
#
# if a >= b then !(a < b)
def ge_relation
# Compare them as in less than but invert the result.
cmp_relation(EBX, EAX, :invert => true)
end
# a: <on the stack>
# b: eax
#
# if a <= b then !(a > b)
def le_relation
# Compare them as in greater than but invert the result.
cmp_relation(EAX, EBX, :invert => true)
end
######################################
# statements and controls structures #
######################################
# Dispatch to the handler registered for the keyword that was just scanned.
# Raises RuntimeError for keywords without a handler (such as 'else'/'end').
def keyword
  action = @keywords[@value]
  raise "unsupported keyword: #{@value}" unless action

  send(action)
end
# Parse an assignment statement. Value is in eax.
def assignment
# The token scanned last is taken as the name of the variable assigned to.
name = @value
match('=')
boolean_expression
lval = asm.var!(name)
asm.store_var(lval, return_reg)
end
# Parse a code block.
def block
  @indent += 1
  # scan a token, type ends up in @token and value in @value
  scan
  # A block runs until 'else', 'end' or the end of the input.
  until %w[else end].include?(@value) || eof?
    @token == :keyword ? keyword : assignment
    scan
  end
  @indent -= 1
end
# Parse an if-else statement.
def if_else_stmt
else_label = asm.make_label(:end_or_else)
end_label = else_label # only generated if else clause
# present
condition
skip_any_whitespace
# Skip the body when the condition compared equal to zero (false).
asm.jz(else_label)
block
if @token == :keyword && @value == 'else'
skip_any_whitespace
end_label = asm.make_label(:endif) # now we need the 2nd label
# End of the 'if' body: jump over the else body.
asm.jmp(end_label)
asm.define_label(else_label)
block
end
match_word('end')
asm.define_label(end_label)
end
# Used to implement the Two-Label-Loops (while, until, repeat).
#
# name: Name of the loop for readable labels.
# block: Code to execute at the start of each iteration. (e.g. a
# condition)
def simple_loop(name)
start_label = asm.make_label(:"#{name}_loop")
end_label = asm.make_label(:"end_#{name}")
asm.define_label(start_label)
# The caller's block emits the per-iteration code (e.g. the exit test) and
# receives the label to jump to in order to leave the loop.
yield(end_label)
# While the body is parsed, end_label is on the label stack (see #break_stmt).
pushing_label(end_label) { block }
match_word('end')
asm.jmp(start_label)
asm.define_label(end_label)
end
# A simple loop that evaluates a condition at the top of every iteration and
# leaves the loop using the given jump instruction (a method name on asm).
def condition_loop(name, jump_instruction)
simple_loop(name) do |end_label|
condition
skip_any_whitespace
asm.send(jump_instruction, end_label)
end
end
# Loop for as long as the condition is truthy.
def while_stmt
condition_loop('while', :jz) # done when == 0 (falsish)
end
# Loop for as long as the condition is false.
def until_stmt
condition_loop('until', :jnz) # done when != 0 (truthy)
end
# Loop without an exit test of its own.
def repeat_stmt
simple_loop('repeat') do |end_label|
skip_any_whitespace
end
end
# s = 0
# for x = 1 to 5
# s = s + x
# end
def for_stmt
name = get_name
counter = asm.define_var(name)
match('=')
boolean_expression # initial value
asm.sub(return_reg, 1) # pre-decrement because of the
# following pre-increment
asm.mov([counter], EAX) # stash the counter in memory
match_word('to', :scan => true)
boolean_expression # final value
skip_any_whitespace
asm.push(EAX) # stash final value on stack
final = [ESP]
# The counter is incremented and tested at the top of every iteration.
# NOTE(review): the loop exits as soon as the counter equals the final
# value, before the body runs for it -- confirm the bound is meant to be
# exclusive.
simple_loop('for') do |end_label|
asm.mov(ECX, [counter]) # get the counter
asm.add(ECX, 1) # increment
asm.mov([counter], ECX) # store the counter
asm.cmp(final, ECX) # check if we're done
asm.jz(end_label) # if so jump to the end
end
asm.add(ESP, 4) # clean up the stack
end
# do 5
# ...
# end
def do_stmt
# The expression gives the repeat count, which is kept in ECX.
boolean_expression
skip_any_whitespace
asm.mov(ECX, EAX)
start_label = asm.make_label(:do)
end_label = asm.make_label(:enddo)
asm.define_label(start_label)
# Save the count on the stack around the body and restore it afterwards.
asm.push(ECX)
pushing_label(end_label) { block }
asm.pop(ECX)
match_word('end')
asm.dec(ECX)
asm.jnz(start_label)
# Phony push! break needs to clean up the stack, but since we
# don't know if there is a break at this point we fake a push and
# always clean up the stack after.
asm.sub(ESP, 4)
asm.define_label(end_label)
# If there was a break we have to clean up the stack here. If
# there was no break we clean up the phony push above.
asm.add(ESP, 4)
end
# Jump to the end label of the innermost enclosing loop; a break outside any
# loop is reported as an error.
def break_stmt
  target = top_label
  return asm.jmp(target) if target

  expected(:'break to be somewhere useful',
           :got => :'a break outside a loop')
end
# Evaluates a boolean expression and compares the result with zero.
def condition
boolean_expression
skip_whitespace
# Emits the comparison only; callers emit the jump that acts on it.
asm.cmp(EAX, 0) # 0 is false, anything else is true
end
# print eax in hex format
def print_stmt
# variables
d = '__DIGITS'
h = '__HEX'
# The digit lookup table is defined and filled in only the first time.
digits = if asm.var?(d)
asm.var(d)
else
d_var = asm.define_var(d, 16)
asm.block do
# define a lookup table of digits
# (each constant is four ASCII characters, lowest byte first:
# "0123", "4567", "89ab", "cdef")
mov([d_var], 0x33323130)
mov([d_var+4], 0x37363534)
mov([d_var+8], 0x62613938)
mov([d_var+12], 0x66656463)
end
d_var
end
# 12 bytes: 2 for "0x", 8 hex digits, 2 for newline + null terminator
hex = asm.var!(h, 12)
asm.block do
# TODO check sign and prepend '-' if negative
mov([hex], 0x7830) # "0x" ==> 0x30 (48), 0x78 (120)
mov([hex+4], 0) # zero the rest
mov([hex+8], 0)
mov([:byte, hex+10], 0xa) # newline
mov([:byte, hex+11], 0) # null terminator
end
boolean_expression # result in EAX
asm.block do
# convert eax to a hex string
lea(ESI, [digits])
lea(EDI, [hex+9])
# build the string backwards (right to left), byte by byte
mov(ECX, 4)
end
asm.block do
define_label(loop_label = make_label)
# low nybble of nth byte
movzx(EBX, AL)
and_(BL, 0x0f) # isolate low nybble
movzx(EDX, [:byte, ESI+EBX])
mov([EDI], DL)
dec(EDI)
# high nybble of nth byte
movzx(EBX, AL)
and_(BL, 0xf0) # isolate high nybble
shr(BL, 4)
mov(DL, [ESI+EBX])
mov([EDI], DL)
dec(EDI)
shr(EAX, 8)
loop_(loop_label)
# write(int fd, char *s, int n)
mov(EAX, 4) # SYS_write
lea(ECX, [hex]) # ecx = &s
args = [1, # fd = 1 (STDOUT)
ECX, # s = &s
11] # n = 11 (excluding term, max # of chars to print)
# NOTE(review): `platform` is resolved inside the asm.block block -- confirm
# where it is defined, since this class does not define it in view.
case platform
when 'darwin' # on the stack, right to left (right @ highest addr)
####
# setup bogus stack frame
push(EBP)
mov(EBP, ESP)
sub(ESP, 36)
####
args.reverse.each { |a| push(a) }
push(EAX)
int(0x80)
####
# teardown bogus stack frame
xor(EAX, EAX)
add(ESP, 36)
pop(EBX)
leave
####
when 'linux'
# arguments are passed in registers
mov(EBX, args[0])
mov(ECX, args[1])
mov(EDX, args[2])
int(0x80)
end
end
end
############
# internal #
############
# True once the input stream is exhausted and there is no lookahead char left.
def eof?
@input.eof? && @look.nil?
end
# True when +char+ can begin an operator of the given kind (any kind by
# default). Note the argument order differs from #op?.
def op_char?(char, kind = :all)
OP_CHARS[kind].include?(char)
end
# True when +token+ is a complete operator of the given kind.
def op?(kind, token)
OPS[kind].include?(token)
end
# Read the next character from the input stream.
def get_char
  # At the end of the input the lookahead becomes nil. Keeping the previous
  # character would mean eof? never turns true, and loops that wait for the
  # lookahead to change would never finish.
  @look = @input.eof? ? nil : @input.readbyte.chr
end
# Report what was expected
# Raises ParseError saying that +what+ was expected. The offending input is
# options[:got] when given, otherwise the value of the last token. Symbols
# are shown bare, anything else in single quotes. Unless the input is
# exhausted, the rest of the current line is attached as context.
def expected(what, options = {})
  got = options.fetch(:got, @value)
  got, what = [got, what].map do |item|
    item.is_a?(Symbol) ? item : "'#{item}'"
  end
  raise ParseError.new(caller), "Premature end of file, expected: #{what}." if eof?

  line = begin
    @input.readline
  rescue StandardError
    '(EOF)'
  end
  context = line.gsub("\n", "\\n")
  raise ParseError.new(caller, context), "Expected #{what} but got #{got}."
end
# Recognize an alphabetical character.
def alpha?(char)
  # A nil lookahead (no character left) is simply not alphabetical.
  !char.nil? && ('A'..'Z') === char.upcase
end
# Recognize a decimal digit.
def digit?(char)
  ('0'..'9') === char
end

# Recognize an alphanumeric character (underscore included).
def alnum?(char)
  return true if alpha?(char)

  digit?(char) || char == '_'
end

# XXX disabled! ... should treat true/false as constants
# once again we need a token of lookahead
def boolean?(char)
  #char == 't' || char == 'f'
  false
end

# Recognize a space or a tab.
def whitespace?(char)
  [' ', "\t"].include?(char)
end

# Recognize a line feed or a carriage return.
def newline?(char)
  ["\n", "\r"].include?(char)
end

# Recognize the character that starts a comment.
def comment_char?(char)
  '#' == char
end

# Recognize whitespace of any kind, newlines included.
def any_whitespace?(char)
  newline?(char) || whitespace?(char)
end
# Parse one or more newlines.
def get_newline
expected(:newline, :got => @look) unless newline?(@look)
# Consume the whole run of newline characters.
many(:newline?)
# However many were read, the token value is a single "\n".
@token = :newline
@value = "\n"
end
# Match literal input.
def match(char)
# Works on the lookahead character, not on the last scanned token.
expected(char, :got => @look) unless @look == char
# puts "[ch] #{indent}#{char}"
get_char
skip_whitespace
end
# Check that the value of the current token is the given word, optionally
# scanning a fresh token first (:scan => true).
def match_word(word, options = {})
  scan if options[:scan]
  return true if @value == word

  # A mismatch is reported through expected.
  expected(word)
  false
end
# Parse zero or more consecutive characters for which the test is
# true.
def many(test)
  # A symbol names one of this object's predicate methods.
  predicate = test.is_a?(Symbol) ? method(test) : test
  token = ''
  until eof? || !predicate[@look]
    token << @look
    get_char
  end
  skip_whitespace
  token
end
# Parse a "name" (keyword or identifier).
# Sets @value to the name and @token to :keyword or :identifier; returns the
# name.
def get_name
expected(:identifier) unless alpha?(@look)
@value = many(:alnum?)
@token = @keyword_names.include?(@value) ? :keyword : :identifier
@value
end
# Parse a number.
# Sets @token to :number and @value to the digits as a string; returns that
# string.
def get_number
expected(:integer) unless digit?(@look)
@token = :number
@value = many(:digit?)
# puts "[nu] #{indent}#{@value} (0x#{@value.to_i.to_s(16)})"
@value
end
# Parse the literal 'true' or 'false'. Sets @token to :boolean and returns
# the literal as a string.
def get_boolean
get_name
expected(:boolean) unless @value == 'true' || @value == 'false'
@token = :boolean
# puts "[bo] #{indent}#{@value}"
@value
end
# Parse an operator: the longest run of operator characters. Sets @token to
# :op and @value to the operator text.
def get_op
expected(:operator) unless op_char?(@look)
@token = :op
@value = many(:op_char?)
end
# Skip leading whitespace (spaces and tabs only).
def skip_whitespace
get_char while whitespace?(@look)
end
# Skip leading whitespace including newlines.
def skip_any_whitespace
get_char while any_whitespace?(@look)
end
# Skip the rest of a comment line and any whitespace that follows it.
def skip_comment
  # A nil lookahead is treated as the end of the comment, so a comment on the
  # last line of the input does not need a terminating newline.
  get_char until @look.nil? || newline?(@look)
  skip_any_whitespace
end
# Indentation used for pretty printing: four spaces per nesting level.
# 'else' and 'end' are printed one level further out.
def indent
  depth = %w[else end].include?(@value) ? @indent - 1 : @indent
  ' ' * (depth * 4)
end
# Push +reg+, run the block, then drop the pushed value again by adding 4 to
# ESP (the value is discarded, not popped into a register).
def pushing(reg)
asm.push(reg)
yield
asm.add(ESP, 4)
end
# Debugging aid: print the last token as a type tag, the current indentation
# and the token value. Raises RuntimeError for an unknown token type.
def print_token
  tags = {
    :keyword    => '[kw] ',
    :number     => '[nu] ',
    :identifier => '[id] ',
    :op         => '[op] ',
    :boolean    => '[bo] ',
    :newline    => ''
  }
  tag = tags.fetch(@token) do
    raise "print doesn't know about #{@token}: #{@value}"
  end
  print(tag)
  print indent
  puts @value
end
# Keep +label+ on top of the label stack while the block runs.
def pushing_label(label)
  push_label(label)
  yield
  pop_label
end

# Put a label on top of the label stack.
def push_label(label)
  @label_stack << label
end

# The label on top of the stack, or nil when the stack is empty.
def top_label
  @label_stack.last
end

# Remove and return the label on top of the stack.
def pop_label
  @label_stack.pop
end
# for debugging
# Wrap each of the given zero-argument methods so that +callback+ is invoked
# after it; the wrapped method's return value is passed through. The original
# method stays available under the name orig_<method>.
def self.hook(callback, methods)
  methods.each do |name|
    original = :"orig_#{name}"
    alias_method original, name
    define_method(name) do
      send(original).tap { send(callback) }
    end
  end
end
# hook(:print_token, [:get_name, :get_newline, :get_number, :get_op, :get_boolean])
end
end