[NEW] Binary assembler outputs working machine code and Mach-O object files that can be linked into working executables.

This commit is contained in:
Sami Samhuri 2009-06-25 09:42:56 -07:00
parent 47ce9043e4
commit bc6a3d4d3b
25 changed files with 2082 additions and 894 deletions

View file

@ -1,17 +1,4 @@
test:
cd test && make all
elfwriter: elfwriter.c
gcc -o elfwriter elfwriter.c -lelf
test_elf: elfwriter build
./elfwriter test.bin 4 test_elf.o
ld -o test_elf test_elf.o
./test_elf
clean:
@rm -f elfwriter
@rm -f test_elf.o
@rm -f test_elf
.PHONY: test

View file

@ -5,6 +5,8 @@
# sjs
# may 2009
require 'asm/registers'
module Assembler
# Abstract class for common functionality between different code
@ -14,41 +16,14 @@ module Assembler
attr_reader :platform
def initialize(platform='linux', *args)
def initialize(platform)
@platform = platform
@vars = {} # Symbol table, maps names to locations in BSS.
@num_labels = 0 # Used to generate unique labels.
@num_labels_with_suffix = Hash.new(0)
# Maps names to locations.
@labels = Hash.new {|h, key| raise "undefined label: #{key}"}
end
def block(*args, &blk)
instance_eval(&blk)
end
def output
raise "#{self.class} is supposed to implement this method!"
end
def var(name)
@vars[name]
end
alias_method :var?, :var
# Generate a unique label.
def label(suffix=nil)
@num_labels += 1
if suffix
@num_labels_with_suffix[suffix] += 1
suffix = "_#{suffix}_#{@num_labels_with_suffix[suffix]}"
end
name = "L#{sprintf "%06d", @num_labels}#{suffix}"
return name
end
end
end

File diff suppressed because it is too large Load diff

320
asm/cstruct.rb Normal file
View file

@ -0,0 +1,320 @@
# Struct does some trickery with custom allocators so we can't subclass it without writing C.
# Instead we define a CStruct class that does something similar enough for our purpose. It is
# subclassed just like any other class. A nice side-effect of this syntax is that it is always
# clear that a CStruct is just a class and instances of the struct are objects.
#
# Some light metaprogramming is used to make the following syntax possible:
#
# class MachHeader < CStruct
# uint :magic
# int :cputype
# int :cpusubtype
# ...
# int :flags
# end
#
# Inheritance works as you would expect.
#
# class LoadCommand < CStruct
# uint32 :cmd
# uint32 :cmdsize
# end
#
# # inherits cmd and cmdsize as the first 2 fields
# class SegmentCommand < LoadCommand
# string :segname, 16
# uint32 :vmaddr
# uint32
# end
#
# Nothing tricky or confusing there. Members of a CStruct class are declared in the
# class definition. A different definition using a more static approach probably wouldn't
# be very hard... if performance is critical ... but then why are you using Ruby? ;-)
# Base class for C-like binary structures. Subclasses declare their members in
# the class body with the type macros (uint32, string, ...); instances hold one
# value per member and can be converted to and from packed binary strings.
class CStruct

  ###################
  # Class Constants #
  ###################

  # Size in bytes of each member type. Callable entries are invoked with the
  # options given when the member was declared and return the size.
  SizeMap = {
    :int8   => 1,
    :uint8  => 1,
    :int16  => 2,
    :uint16 => 2,
    :int32  => 4,
    :uint32 => 4,
    :string => lambda { |*opts| opts.first }, # first opt is size
    # the last 3 are to make the language more C-like
    :int    => 4,
    :uint   => 4,
    :char   => 1
  }

  # Array#pack directives (32-bit) for each member type. Callable entries
  # receive the value plus the member's options and return the packed bytes.
  PackMap = {
    :int8   => 'c',
    :uint8  => 'C',
    :int16  => 's',
    :uint16 => 'S',
    :int32  => 'i',
    :uint32 => 'I',
    # Strings are NUL-padded (or truncated) to exactly the declared length.
    :string => lambda do |str, *opts|
      len = opts.first
      str.ljust(len, "\0")[0, len]
    end,
    # a few C-like names
    :int    => 'i',
    :uint   => 'I',
    :char   => 'C'
  }

  # Only needed when unpacking is different from packing, i.e. strings w/
  # lambdas in PackMap. Callable entries must consume their own bytes from the
  # front of the buffer they are given.
  UnpackMap = {
    :string => lambda do |str, *opts|
      len = opts.first
      # Consume exactly the declared field width so that the members which
      # follow the string are still in the buffer, then drop the NUL padding.
      field = str.slice!(0, len) || ''
      field.sub(/\0+\z/, '')
    end
  }

  ##########################
  # Class Instance Methods #
  ##########################

  # Note: const_get and const_set are used so the constants are bound at
  # runtime, to the real class that has subclassed CStruct.
  class << self

    # Give every subclass its own Members, MemberIndex, MemberSizes and
    # MemberOptions tables. Tables visible from the subclass (i.e. inherited
    # from its parent struct) are cloned so that adding members to the subclass
    # never modifies the parent; otherwise empty tables are used.
    def inherited(subclass)
      super
      defaults = [[:Members, []], [:MemberIndex, {}], [:MemberSizes, {}], [:MemberOptions, {}]]
      defaults.each do |const, empty|
        table = begin
          subclass.const_get(const).clone
        rescue NameError
          empty
        end
        subclass.const_set(const, table)
      end
    end

    # Define a class-level macro for each type name. Calling it inside a
    # struct definition appends a member of that type; any extra arguments
    # (e.g. the length of a string) are stored as the member's options.
    SizeMap.keys.each do |type|
      define_method(type) do |name, *args|
        name = name.to_sym
        const_get(:MemberIndex)[name]   = const_get(:Members).size
        const_get(:MemberSizes)[name]   = type
        const_get(:MemberOptions)[name] = args
        const_get(:Members) << name
      end
    end

    # Return the number of members.
    def size
      const_get(:Members).size
    end
    alias_method :length, :size

    # Return the number of bytes occupied in memory or on disk.
    def bytesize
      const_get(:Members).inject(0) { |total, name| total + sizeof(name) }
    end

    # Return the size in bytes of the named member.
    def sizeof(name)
      value = SizeMap[const_get(:MemberSizes)[name]]
      value.respond_to?(:call) ? value.call(*const_get(:MemberOptions)[name]) : value
    end

    # Build a new struct whose values are read from the binary string +bin+.
    def new_from_bin(bin)
      new.unserialize(bin)
    end
  end

  ####################
  # Instance Methods #
  ####################

  attr_reader :values

  # Values are given positionally, in member declaration order.
  def initialize(*args)
    @values = args
  end

  # Pack all values, in member order, into one binary string.
  def serialize
    members.zip(@values).map do |name, value|
      packer = PackMap[member_sizes[name]]
      if packer.is_a?(String)
        [value].pack(packer)
      else
        packer.call(value, *member_options[name])
      end
    end.join
  end

  # Replace this struct's values with those read from the binary string
  # +bin+. The argument itself is left untouched. Returns self.
  def unserialize(bin)
    # dup (not clone) so that a frozen argument yields a mutable working copy.
    buffer = bin.dup
    @values = []
    members.zip(unpack_pattern).each do |name, unpacker|
      if unpacker.is_a?(String)
        @values.concat(buffer.unpack(unpacker))
        buffer.slice!(0, sizeof(name))
      else
        # Callable unpackers remove their own bytes from the buffer.
        @values << unpacker.call(buffer, *member_options[name])
      end
    end
    self
  end

  # One PackMap entry per member, in member order.
  def pack_pattern
    members.map { |name| PackMap[member_sizes[name]] }
  end

  # Like pack_pattern, but preferring the UnpackMap entry where one exists.
  def unpack_pattern
    members.map { |name| UnpackMap[member_sizes[name]] || PackMap[member_sizes[name]] }
  end

  # Read a value by member name (String or Symbol) or by position.
  def [](name_or_idx)
    case name_or_idx
    when Numeric
      @values[name_or_idx]
    when String, Symbol
      @values[member_index[name_or_idx.to_sym]]
    else
      raise ArgumentError, "expected name or index, got #{name_or_idx.inspect}"
    end
  end

  # Write a value by member name (String or Symbol) or by position.
  def []=(name_or_idx, value)
    case name_or_idx
    when Numeric
      @values[name_or_idx] = value
    when String, Symbol
      @values[member_index[name_or_idx.to_sym]] = value
    else
      raise ArgumentError, "expected name or index, got #{name_or_idx.inspect}"
    end
  end

  # Two structs are equal when +other+ is an instance of this struct's class
  # (or a subclass) and holds equal values. The type check comes first so that
  # comparing against an arbitrary object returns false instead of raising.
  def ==(other)
    other.is_a?(self.class) && other.values == @values
  end

  # Some of these are just to quack like Ruby's built-in Struct. YAGNI, but can't hurt either.

  def each(&block)
    @values.each(&block)
  end

  def each_pair(&block)
    members.zip(@values).each(&block)
  end

  def size
    members.size
  end
  alias_method :length, :size

  def sizeof(name)
    self.class.sizeof(name)
  end

  def bytesize
    self.class.bytesize
  end

  alias_method :to_a, :values

  # A few convenience methods exposing the class-level member tables.

  def members
    self.class::Members
  end

  def member_index
    self.class::MemberIndex
  end

  def member_sizes
    self.class::MemberSizes
  end

  def member_options
    self.class::MemberOptions
  end

  # The last expression is returned, so return self instead of junk.
  self
end
# a small test
if $0 == __FILE__
  # Smoke test: declare a struct, print its layout, then round-trip one
  # instance through serialize / new_from_bin.
  class MachHeader < CStruct
    uint   :magic
    int    :cputype
    int    :cpusubtype
    string :segname, 16
  end

  [MachHeader::Members, MachHeader::MemberIndex, MachHeader::MemberSizes].each do |table|
    puts table.inspect
  end
  puts "# of MachHeader members: #{MachHeader.size}, size in bytes: #{MachHeader.bytesize}"

  mh = MachHeader.new(0xfeedface, 7, 3, "foobar")
  [:magic, :cputype, :cpusubtype, :segname].each do |field|
    puts "#{field}(#{MachHeader.sizeof(field)}): #{mh[field].inspect}"
  end
  puts mh.pack_pattern.inspect

  binstr = mh.serialize
  puts "values: #{mh.values.inspect}"

  newmh = MachHeader.new_from_bin(binstr)
  puts "new values: #{newmh.values.inspect}"

  newbinstr = newmh.serialize
  puts "serialized: #{binstr.inspect}"
  puts "unserialized: #{newbinstr.inspect}"
  puts "new == old ? #{newbinstr == binstr}"
end

7
asm/elfsymtab.rb Normal file
View file

@ -0,0 +1,7 @@
module Assembler
# Symbol table for ELF output. Currently an empty subclass of Symtab: it adds
# no behaviour of its own.
# NOTE(review): Symtab says concrete subclasses must define bss_offset and
# const_offset; this class defines neither yet.
class ELFSymtab < Symtab
end
end

9
asm/elfwriter.rb Normal file
View file

@ -0,0 +1,9 @@
module Assembler
# Object-file writer for ELF output. Currently an empty subclass of ObjWriter:
# it overrides nothing.
class ELFWriter < ObjWriter
end
end

164
asm/macho.rb Normal file
View file

@ -0,0 +1,164 @@
require 'asm/cstruct'
# The MachO module contains constants and structures related to the
# Mach Object format (Mach-O). They are relevant to Darwin on OS X.
#
# Constants and structures as defined in /usr/include/mach-o/loader.h on
# Mac OS X Leopard (10.5.7). Also see <mach-o/stab.h> and <mach-o/nlist.h>.
module MachO
###############
# Mach header #
###############
# Appears at the beginning of every Mach object file.
# Members are serialized in the order they are declared here.
class MachHeader < CStruct
uint32 :magic
int32 :cputype
int32 :cpusubtype
uint32 :filetype
uint32 :ncmds
uint32 :sizeofcmds
uint32 :flags
end
# Values for the magic field.
MH_MAGIC = 0xfeedface # Mach magic number.
MH_CIGAM = 0xcefaedfe # In the reverse byte-order.
# Values for the filetype field.
MH_OBJECT = 0x1
MH_EXECUTE = 0x2
MH_FVMLIB = 0x3
MH_CORE = 0x4
MH_PRELOAD = 0x5
MH_DYLIB = 0x6
MH_DYLINKER = 0x7
MH_BUNDLE = 0x8
MH_DYLIB_STUB = 0x9
MH_DSYM = 0xa
# CPU types and subtypes (only Intel for now).
CPU_TYPE_X86 = 7
CPU_TYPE_I386 = CPU_TYPE_X86
CPU_SUBTYPE_X86_ALL = 3
############################
# Load commands / segments #
############################
# Common prefix of every load command; the structs below inherit these two
# members as their first two fields.
class LoadCommand < CStruct
uint32 :cmd
uint32 :cmdsize
end
# Values for the cmd member of LoadCommand CStructs (incomplete!).
LC_SEGMENT = 0x1
LC_SYMTAB = 0x2
LC_SYMSEG = 0x3
LC_THREAD = 0x4
LC_UNIXTHREAD = 0x5
# A segment load command (cmd == LC_SEGMENT).
class SegmentCommand < LoadCommand
string :segname, 16
uint32 :vmaddr
uint32 :vmsize
uint32 :fileoff
uint32 :filesize
int32 :maxprot
int32 :initprot
uint32 :nsects
uint32 :flags
end
# Values for protection fields, maxprot and initprot.
VM_PROT_NONE = 0x00
VM_PROT_READ = 0x01
VM_PROT_WRITE = 0x02
VM_PROT_EXECUTE = 0x04
VM_PROT_NO_CHANGE = 0x08
VM_PROT_COPY = 0x10
# A symbol table load command (cmd == LC_SYMTAB).
class SymtabCommand < LoadCommand
uint32 :symoff # Points to an array of Nlist structs.
uint32 :nsyms # Number of entries in said array.
uint32 :stroff # Offset of the string table.
uint32 :strsize # Size of the string table in bytes.
end
# Maps a cmd value to the struct class describing that load command. Only
# the command types listed here have a struct defined in this module.
LoadCommandStructMap = {
LC_SEGMENT => SegmentCommand,
LC_SYMTAB => SymtabCommand
}
############
# Sections #
############
# Header describing one section.
class Section < CStruct
string :sectname, 16
string :segname, 16
uint32 :addr
uint32 :size
uint32 :offset
uint32 :align
uint32 :reloff
uint32 :nreloc
uint32 :flags
uint32 :reserved1
uint32 :reserved2
end
# Values for the type bitfield (mask 0x000000ff) of the flags field.
# (incomplete!)
S_REGULAR = 0x0
S_ZEROFILL = 0x1
S_CSTRING_LITERALS = 0x2
########################
# Symbol table support #
########################
# Nlist is used to describe symbols.
class Nlist < CStruct
uint32 :n_strx # Index into string table. Index of zero is the empty string.
uint8 :n_type # Type flag (see below).
uint8 :n_sect # Section number (from 1) or NO_SECT.
uint16 :n_desc # TODO See <mach-o/stab.h>.
uint32 :n_value # The symbol's value (or stab offset).
end
# Type flag (see <mach-o/nlist.h> for more details)
# ---------
#
# This field consists of four bitfields:
#
# uchar N_STAB : 3
# uchar N_PEXT : 1
# uchar N_TYPE : 3
# uchar N_EXT : 1
#
N_STAB = 0xe0 # if any bits set => symbolic debugging info
N_PEXT = 0x10 # private external symbol bit
N_TYPE = 0x0e # mask for the type bits
N_EXT = 0x01 # external symbol bit, set for external symbols (e.g. globals)
# Values for N_TYPE. (incomplete!)
N_UNDF = 0x0 # undefined, n_sect == NO_SECT
N_ABS = 0x2 # absolute, n_sect == NO_SECT
N_SECT = 0xe # defined in section number n_sect
# Limits for the n_sect member, which is declared as a uint8 above.
NO_SECT = 0
MAX_SECT = 255
end

364
asm/machofile.rb Normal file
View file

@ -0,0 +1,364 @@
require 'asm/macho'
module Assembler
class MachOFile
include MachO
attr_accessor :header, :load_commands, :sections, :data
attr_accessor :current_segment
attr_accessor :text_offset
# Create an empty Mach-O file description: a header with no load commands.
#
# filetype - value stored in the header's filetype field (default MH_OBJECT).
def initialize(filetype=MH_OBJECT)
@header = MachHeader.new(MH_MAGIC, CPU_TYPE_X86, CPU_SUBTYPE_X86_ALL, filetype, 0, 0, 0)
@load_commands = [] # All defined load commands, in the order they were added.
@sections = {} # Map of segment names to lists of sections.
@section_disk_size = Hash.new(0) # Sections store their VM size so we need their sizes on disk.
# (keyed by section name)
@data = [] # Blobs of data that appear at the end of the file.
# (text, data, symtab, ...)
@current_segment = nil # An alias for the last defined segment.
# Leave room for __PAGEZERO, a single 0x1000 (4kb) page at 0x0. The
# __TEXT segment starts at 0x1000 and contains mach headers and load
# commands.
# NOTE(review): only exposed through attr_accessor in the visible part of
# this class; no other method here reads it.
@text_offset = 0x1000
end
# Define a LoadCommand in this file. The header's ncmds and sizeofcmds
# fields are updated automatically to keep things in sync. If a block is
# given it is passed the new LoadCommand struct after all other
# initialization has been done.
#
# Other methods that create any type of load command should use this
# method to do so. Right now the only types supported are LC_SEGMENT
# and LC_SYMTAB. Modify asm/macho.rb to add structs for other types, and
# add them to LoadCommandStructMap.
# Append a load command of the given type and account for it in the header
# (ncmds, sizeofcmds). Yields the new command struct if a block is given.
#
# cmdtype - a key of LoadCommandStructMap (e.g. LC_SEGMENT, LC_SYMTAB).
#
# Returns the new command struct. Raises RuntimeError for unmapped types.
def load_command(cmdtype)
  struct = LoadCommandStructMap[cmdtype]
  if !struct
    supported = LoadCommandStructMap.keys.sort.inspect
    raise "unsupported load command type: #{cmdtype.inspect}, supported types: #{supported}"
  end

  # Every member after cmd and cmdsize starts out as 0. That is nonsense for
  # string members, but callers overwrite those before serializing.
  zeroes = [0] * (struct::Members.size - 2)
  command = struct.new(cmdtype, struct.bytesize, *zeroes)

  @load_commands.push(command)
  @header[:ncmds] += 1
  @header[:sizeofcmds] += command.bytesize

  yield command if block_given?
  command
end
# Define a segment in this file. If a block is given it is passed
# the new segment. You can chain calls to segment, it returns self.
#
# Mach object files should only contain one anonymous segment. This
# is not checked but should be kept in mind when crafting files.
# Add an LC_SEGMENT load command named +name+ and make it the current
# segment. The optional block receives the new segment after its name has
# been set. Returns self so calls can be chained.
def segment(name, &block)
  new_segment = load_command(LC_SEGMENT) do |command|
    command[:segname] = name
    block.call(command) unless block.nil?
  end
  @current_segment = new_segment
  self
end
# Define a section under the given segment. nsects and cmdsize are
# updated automatically. segname can't be derived from the segment
# that this section is defined under, as they can differ.
#
# Mach object files have the __text, __data, and other common
# sections all defined under one anonymous segment, but their segment
# names reflect their final positions after linking. The linker plonks
# them in the segment that they name.
# Define a section under +segment+ and keep the header and segment
# bookkeeping (sizeofcmds, nsects, cmdsize) in sync.
#
# name    - section name (sectname).
# segname - segment name recorded in the section header; it may differ from
#           the name of the segment the section is defined under.
# data    - bytes written to the file for this section ('' for none).
# vmsize  - value for the section's size field (defaults to data.size).
# segment - segment command to attach to (defaults to the current segment).
# type    - section type, stored in the flags field (defaults to S_REGULAR).
#
# Yields the new Section if a block is given, and returns it.
# Raises ArgumentError when there is no segment to attach the section to.
def section(name, segname, data='', vmsize=data.size,
            segment=@current_segment, type=S_REGULAR)
  unless segment
    raise ArgumentError, "cannot define section #{name.inspect}: no segment has been defined"
  end

  # Positional members: sectname, segname, addr, size, offset, align, reloff,
  # nreloc, flags, reserved1, reserved2. The section type belongs in flags;
  # it was previously passed in the reserved2 position.
  section = Section.new(name, segname, 0, vmsize, 0, 0, 0, 0, type, 0, 0)

  # Add this section to the map of segment names to sections.
  (@sections[segment[:segname]] ||= []) << section
  @section_disk_size[name] = data.size
  @data << data if data.size > 0

  # Update the header.
  @header[:sizeofcmds] += section.bytesize

  # Update the segment.
  segment[:nsects] += 1
  segment[:cmdsize] += section.bytesize

  yield(section) if block_given?
  section
end
# Define a standard text section under the current segment (if present).
#
# If there is no current segment then we act according to the file's type
# (specified in the header). Segments are created if they do not exist.
#
# When it is MH_OBJECT the text section is defined under a single,
# nameless segment, but the section's segment name is set to the name
# given here.
#
# For MH_EXECUTE files the text section goes under the segment with the
# name given (__TEXT).
# Add a text section holding +data+. When no segment exists yet, one is
# created first (named according to the file type) with max protection
# read/write/execute and initial protection read/execute. Returns self.
def text(data, sectname='__text', segname='__TEXT')
  if !@current_segment
    segment(segname_based_on_filetype(segname)) do |new_seg|
      new_seg[:maxprot]  = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE
      new_seg[:initprot] = VM_PROT_READ | VM_PROT_EXECUTE
    end
  end

  section(sectname, segname, data) do |text_sect|
    text_sect[:flags] = 0x400 # S_ATTR_SOME_INSTRUCTIONS
  end

  self
end
# Define a standard data section under the current segment (if present).
# This behaves similarly to the text method.
#
# Add a data section holding +data+. When no segment exists yet, one is
# created first (named according to the file type) with read/write
# protection. Returns self.
def data(data, sectname='__data', segname='__DATA')
  if !@current_segment
    read_write = VM_PROT_READ | VM_PROT_WRITE
    segment(segname_based_on_filetype(segname)) do |new_seg|
      new_seg[:maxprot]  = read_write
      new_seg[:initprot] = read_write
    end
  end

  section(sectname, segname, data)
  self
end
# Define a standard const section under the current segment (if present).
# This behaves similarly to the data method.
#
# Add a const section holding +data+. When no segment exists yet, one is
# created first (named according to the file type) with read-only
# protection. Returns self.
def const(data, sectname='__const', segname='__DATA')
  if !@current_segment
    segment(segname_based_on_filetype(segname)) do |new_seg|
      new_seg[:maxprot]  = VM_PROT_READ
      new_seg[:initprot] = VM_PROT_READ
    end
  end

  section(sectname, segname, data)
  self
end
# Define a standard BSS section under the current segment (if present).
# This behaves similarly to the data method but accepts a VM size instead
# of a blob, and no data is written to file since this section is for
# uninitialized data.
#
# Add a BSS section of +vmsize+ bytes. No file data is recorded for it. When
# no segment exists yet, one is created first (named according to the file
# type) with read/write protection. Returns self.
def bss(vmsize, sectname='__bss', segname='__DATA')
  if !@current_segment
    read_write = VM_PROT_READ | VM_PROT_WRITE
    segment(segname_based_on_filetype(segname)) do |new_seg|
      new_seg[:maxprot]  = read_write
      new_seg[:initprot] = read_write
    end
  end

  section(sectname, segname, '', vmsize)
  self
end
# Define a symbol table. This should usually be placed at the end of the
# file.
#
# This function is overloaded to accept either an array of Nlist structs
# packed into a byte string (i.e. a C array) and a string table, or a
# single parameter: any type of Symtab.
# Add an LC_SYMTAB load command and queue the symbol data for output.
#
# Call with either (nlists, string_table), or with a single object that
# responds to #stab and #nlist_ary. Each nlist entry must respond to
# #serialize. Returns self.
def symtab(nlist_ary_or_symtab, stab=nil)
  unless stab.nil?
    nlists = nlist_ary_or_symtab
  else
    table  = nlist_ary_or_symtab
    stab   = table.stab
    nlists = table.nlist_ary
  end

  load_command(LC_SYMTAB) do |command|
    command[:nsyms]   = nlists.size
    command[:strsize] = stab.size
    # symoff and stroff are filled in when offsets are recalculated.
  end

  # The serialized nlist array comes first, followed by the string table.
  packed_nlists = nlists.map { |entry| entry.serialize }.join
  @data << packed_nlists
  @data << stab
  self
end
# Serialize the entire MachO file into a byte string. This is simple
# thanks to CStruct#serialize.
# Serialize the entire Mach-O file into a byte string. File offsets are
# recalculated first.
#
# Layout of the result:
#
#   |------------------|
#   | Mach Header      |
#   |------------------|
#   | Segment 1        |
#   |   Section 1      |
#   |   Section 2      |
#   | Segment 2        |
#   |   ...            |
#   | [Symtab cmd]     |
#   |------------------|
#   | Section data 1   |
#   | Section data 2   |
#   | ...              |
#   | [Symtab data]    |
#   |------------------|
def serialize
  # TODO sanity checks, e.g. assert(@header[:ncmds] == @load_commands.size)
  # ... perhaps an option to recalculate such data as well.
  recalculate_offsets

  parts = [@header.serialize]

  # Each load command is followed directly by the headers of its sections.
  @load_commands.each do |cmd|
    parts << cmd.serialize
    # Only segment commands own sections, and a segment may own none at all.
    sects = cmd[:cmd] == LC_SEGMENT ? (@sections[cmd[:segname]] || []) : []
    sects.each { |sect| parts << sect.serialize }
  end

  # and finally the blobs at the end
  parts.concat(@data)
  parts.join
end
# Update the file offsets in SegmentCommand, SymtabCommand, and Section structs.
# Update the sizes and file offsets in SegmentCommand, SymtabCommand, and
# Section structs so that they describe the layout produced by #serialize.
def recalculate_offsets
  # Running offset into the file, used to update the various structures.
  offset = @header.bytesize

  # First pass over load commands. Most sizes are filled in here.
  @load_commands.each do |cmd|
    case cmd[:cmd]
    when LC_SEGMENT
      seg = cmd
      # A segment may have no sections at all.
      sections = @sections[seg[:segname]] || []
      vm_size = sections.inject(0) { |total, sect| total + sect[:size] }
      disk_size = sections.inject(0) do |total, sect|
        total + @section_disk_size[sect[:sectname]]
      end

      ### TODO this should be redundant. try commenting it out one day.
      seg[:nsects]  = sections.size
      seg[:cmdsize] = seg.bytesize + sections.size * Section.bytesize
      ###

      seg[:vmsize]   = vm_size
      seg[:filesize] = disk_size

    when LC_SYMTAB
      # Nothing to size here; nsyms and strsize are set when the table is added.

    else
      raise "unsupported load command: #{cmd.inspect}"
    end

    offset += cmd[:cmdsize]
  end

  # offset now points to the end of the Mach-O headers, or the beginning
  # of the binary blobs of section data at the end.

  # Second pass over load commands. Fill in file offsets.
  @load_commands.each do |cmd|
    case cmd[:cmd]
    when LC_SEGMENT
      seg = cmd
      sections = @sections[seg[:segname]] || []
      seg[:fileoff] = offset
      sections.each do |sect|
        sect[:offset] = offset
        offset += @section_disk_size[sect[:sectname]]
      end

    when LC_SYMTAB
      st = cmd
      st[:symoff] = offset
      offset += st[:nsyms] * Nlist.bytesize
      st[:stroff] = offset
      offset += st[:strsize]

    # No else clause is necessary, the first pass has already rejected
    # unsupported commands.
    end
  end # @load_commands.each
end # def recalculate_offsets
#######
private
#######
# Pick the name of the segment that sections are defined under: object files
# (MH_OBJECT) use a single anonymous segment, executables (MH_EXECUTE) use
# the given +segname+. Raises RuntimeError for any other file type.
#
# Written with `then` rather than the `when X: value` form, which is a syntax
# error on Ruby 1.9 and later.
def segname_based_on_filetype(segname)
  case @header[:filetype]
  when MH_OBJECT  then ''
  when MH_EXECUTE then segname
  else
    raise "unsupported MachO file type! #{@header.inspect}"
  end
end
end # class MachOFile
end # module Assembler

29
asm/machosym.rb Normal file
View file

@ -0,0 +1,29 @@
require 'asm/macho'
module Assembler

  # A symbol destined for a Mach-O symbol table: a name plus the values that
  # fill an Nlist entry (type flags, section number, descriptor, value).
  class MachOSym

    attr_accessor :name
    attr_accessor :type, :segnum, :desc, :value

    def initialize(name, type, segnum, desc, value)
      @name, @type, @segnum, @desc, @value = name, type, segnum, desc, value
    end

    # Build the Nlist struct for this symbol. +strx+ is the index of the
    # symbol's name in the string table.
    def to_nlist(strx)
      fields = [strx, @type, @segnum, @desc, @value]
      MachO::Nlist.new(*fields)
    end

    # The symbol's name.
    def to_s
      @name
    end

  end

end

77
asm/machosymtab.rb Normal file
View file

@ -0,0 +1,77 @@
require 'asm/macho'
require 'asm/machosym'
require 'asm/symtab'
module Assembler
class MachOSymtab < Symtab
include MachO
# Base address added to constant offsets (hard-coded).
def const_offset
  0x2000
end
# Base address added to variable (bss) offsets (hard-coded).
#
# TODO figure out how to calculate these, or how to let the linker do it!
# ... relocation tables perhaps?
def bss_offset
  0x2800
end
# Build the full list of MachOSym objects: labels first (section 1), then
# constants (section 2), then variables (section 3), each group ordered by
# address.
def all_symbols
  # Sorting a hash yields [name, addr] pairs; order them by addr.
  by_address = lambda { |a, b| a[1] <=> b[1] }

  # Functions (section #1, __text)
  #
  # All labels are exported. This should be changed and only functions exported!
  # TODO fixme ...
  functions = @labels.sort(&by_address).map do |name, addr|
    MachOSym.new(name, N_SECT | N_EXT, 1, 0, addr)
  end

  # Constants (section #2, __const)
  constants = @consts.sort(&by_address).map do |name, addr|
    MachOSym.new(name, N_SECT, 2, 0, addr)
  end

  # Variables (section #3, __bss)
  #
  # TODO FIXME the last var exported ends up after main somewhere... WTF?!
  variables = @vars.sort(&by_address).map do |name, addr|
    MachOSym.new(name, N_SECT, 3, 0, addr)
  end

  functions + constants + variables
end
# Convert every symbol to an Nlist. Each distinct name is given a string
# table index on first sight (starting at 1 and advancing by the name's
# length plus one); later symbols with the same name reuse that index.
def nlist_ary
  index_of = {}
  next_strx = 1
  all_symbols.map do |sym|
    key = sym.name.to_sym
    if !index_of.has_key?(key)
      index_of[key] = next_strx
      next_strx += sym.name.length + 1 # +1 for the null byte
    end
    sym.to_nlist(index_of[key])
  end
end
# Build the string table: every distinct symbol name, in first-seen order,
# separated by NUL bytes.
#
# Each name appears only once so that the offsets match the ones handed out
# by nlist_ary, which shares one index between symbols of the same name.
def stab
  names = all_symbols.map { |sym| sym.to_s }.uniq
  # The empty strings result in a string that begins and ends with a NUL byte.
  ['', names, ''].flatten.join("\0")
end
end
end

26
asm/machowriter.rb Normal file
View file

@ -0,0 +1,26 @@
### XXX development hack!
# Development hack: define a symbol table containing a single external
# symbol, _main, at address 0 in section 1.
#
# Builds the Nlist array and the matching string table by hand and passes
# both to #symtab; returns whatever #symtab returns.
def stub_symtab!
  text_segnum = 1
  symtab_stub = {
    :functions => [
      # name     type             segnum       addr
      ['_main', N_SECT | N_EXT, text_segnum, 0x0]
    ]
  }

  nlist_ary = []
  stab = "\0"
  strx = 1 # offset of the next name within the string table
  symtab_stub[:functions].each do |name, type, segnum, addr|
    nlist_ary << MachO::Nlist.new(strx, type, segnum, 0, addr)
    stab << "#{name}\0"
    strx += name.length + 1 # +1 for the null byte
  end
  symtab(nlist_ary, stab)
end
end
end

26
asm/objwriter.rb Normal file
View file

@ -0,0 +1,26 @@
module Assembler

  # Raised when a writer method has not been implemented by a subclass. The
  # message is the name of the missing method.
  class UnimplementedMethodError < RuntimeError; end

  # Abstract base class for object file writers.
  class ObjWriter

    # Write the serialized object file to +filename+ in binary mode.
    def write!(filename)
      File.open(filename, 'wb') do |file|
        # write, not print: print would append $\ to the binary output if set.
        file.write(serialize)
      end
    end

    # Raise UnimplementedMethodError naming the method that is missing.
    def fail(name)
      raise UnimplementedMethodError, name
    end

    # These methods must be defined for most uses of the library.
    #
    # The stubs accept (and ignore) any arguments. Without the splat, calling
    # a stub with arguments on Ruby 1.9+ raises ArgumentError before the
    # intended UnimplementedMethodError can be raised.
    %w[header segment section text data bss symtab serialize].each do |name|
      define_method(name) { |*args| fail(name) }
    end

  end

end

32
asm/registers.rb Normal file
View file

@ -0,0 +1,32 @@
require 'asm/regproxy'
module Assembler

  module Registers

    # x86 registers of all sizes. A register's number is the index of the
    # row it appears in; its position within the row selects its width
    # (8, 16 or 32 bits, i.e. 2 ** position bytes).
    Registers = [ [:al, :ax, :eax],  # 0
                  [:cl, :cx, :ecx],  # 1
                  [:dl, :dx, :edx],  # 2
                  [:bl, :bx, :ebx],  # 3
                  [:ah, :sp, :esp],  # 4
                  [:ch, :bp, :ebp],  # 5
                  [:dh, :si, :esi],  # 6
                  [:bh, :di, :edi]   # 7
                ]

    # Define one constant per register (AL, AX, EAX, ...), each holding a
    # RegisterProxy. The proxies are used both in effective address
    # calculations and simply as symbols representing registers. The size
    # passed to RegisterProxy is in bits.
    Registers.each_with_index do |row, regnum|
      row.each_with_index do |reg, width_index|
        bits = 8 << width_index
        const_set(reg.to_s.upcase, RegisterProxy.new(reg, bits, regnum))
      end
    end

  end

end

67
asm/regproxy.rb Normal file
View file

@ -0,0 +1,67 @@
module Assembler

  # Acts like a register and can be used as the base or index in an
  # effective address.
  #
  # e.g. [EAX] or [ESI+EBX] or [EAX + 0xff] or [(EAX + EDX) * 2]
  #
  # Note that the scale can only be applied once an index is present, so a
  # scaled address has to be written (base + index) * scale.
  class RegisterProxy

    attr_reader :name, :size, :regnum
    attr_reader :base, :index, :scale

    # name   - register name, e.g. :eax.
    # size   - register size as supplied by the caller.
    # regnum - register number.
    def initialize(name, size, regnum)
      @name = name # attrs are read-only so sharing is ok
      @size = size
      @regnum = regnum
      @base = self
    end

    # Return a copy of this proxy with +index+ attached. The receiver is left
    # unchanged. Raises if an index has already been specified.
    def +(index)
      raise "index already specified" if @index
      with_index = clone
      with_index.instance_variable_set('@index', index)
      with_index
    end

    # Return a copy of this proxy with +scale+ (1, 2, 4 or 8) attached. The
    # receiver is left unchanged, like #+. Raises if there is no index yet, if
    # a scale has already been specified, or if the scale is unsupported.
    def *(scale)
      raise "index must come first" unless @index
      # Check the stored scale; the parameter of the same name is always set.
      raise "scale already specified" if @scale
      raise "unsupported scale: #{scale}" unless scale.to_s.match(/\A[1248]\z/)
      scaled = clone
      scaled.instance_variable_set('@scale', scale)
      scaled
    end

    # The scale, or nil when none has been specified.
    def scale?
      @scale
    end

    # The index, or nil when none has been specified.
    def index?
      @index
    end

    # True for a plain register, i.e. one with neither index nor scale.
    def register?
      @scale.nil? && @index.nil?
    end

    def to_s
      @name.to_s +
        (@index ? "+#{@index}" : '') +
        (@scale ? "*#{@scale}" : '')
    end

    def inspect
      to_s
    end

  end

end

89
asm/symtab.rb Normal file
View file

@ -0,0 +1,89 @@
module Assembler

  # Symbol table shared by the object file formats. It tracks labels,
  # variables (bss) and constants, and accumulates the constant data.
  #
  # NB: Concrete subclasses must define methods named bss_offset and
  # const_offset; #var and #const add them to the stored offsets.
  class Symtab

    # Array#pack directive for each supported constant width in bytes, so
    # that the bytes appended to const_data always match the space reserved.
    ConstPackFormats = { 1 => 'c', 2 => 's', 4 => 'i', 8 => 'q' }

    attr_reader :const_data, :bss_size

    def initialize
      @vars = {}        # Map of variable names to offsets. (bss vars)
      @consts = {}      # Map of constant names to offsets.
      @funcs = {}       # Map of function names to addresses.

      # Packed values of all constants, in definition order.
      @const_data = ''

      @const_size = 0   # Size of const section.
      @bss_size = 0     # Size of bss section.

      # Map names to locations. Looking up an unknown label raises.
      @labels = Hash.new { |h, key| raise "undefined label: #{key}" }
      @num_labels = 0   # Used to generate unique labels.
      @num_labels_with_suffix = Hash.new(0)
    end

    # Generate a unique label name, e.g. "L000003" or, with a suffix,
    # "L000004_if_2" (the trailing number counts uses of that suffix).
    def unique_label(suffix=nil)
      @num_labels += 1
      if suffix
        @num_labels_with_suffix[suffix] += 1
        suffix = "_#{suffix}_#{@num_labels_with_suffix[suffix]}"
      end
      "L#{sprintf "%06d", @num_labels}#{suffix}"
    end

    # Record the address of a label. Returns the label's name.
    def deflabel(name, addr)
      @labels[name] = addr
      name
    end

    # Address of a label; raises RuntimeError if it was never defined.
    def lookup_label(name)
      @labels[name]
    end

    # Reserve +bytes+ bytes of bss for the variable +name+.
    def defvar(name, bytes)
      @vars[name] = @bss_size
      @bss_size += bytes
    end

    # Define the constant +name+ with the given integer +value+, occupying
    # +bytes+ bytes (1, 2, 4 or 8) of const data.
    #
    # Previously the value was always packed as 4 bytes while the running
    # offset advanced by +bytes+, so any other width made the offsets of
    # later constants disagree with const_data.
    #
    # Raises ArgumentError for unsupported widths.
    def defconst(name, value, bytes)
      format = ConstPackFormats[bytes]
      unless format
        raise ArgumentError, "unsupported constant size: #{bytes.inspect} bytes"
      end
      @consts[name] = @const_size
      @const_size += bytes
      @const_data << [value].pack(format)
    end

    # Record the address of a function.
    def defun(name, addr)
      @funcs[name] = addr
    end

    # Address of a variable: bss_offset plus its offset within bss.
    def var(name)
      bss_offset + @vars[name]
    end

    # The variable's offset within bss, or nil if it is not defined.
    def var?(name)
      @vars[name]
    end

    # Address of a constant: const_offset plus its offset within const data.
    def const(name)
      const_offset + @consts[name]
    end

    # The constant's offset within const data, or nil if it is not defined.
    def const?(name)
      @consts[name]
    end

  end

end

View file

@ -3,9 +3,6 @@
# sjs
# may 2009
ROOT = __FILE__.sub(/\/asm\/text\.rb$/, '') unless defined? ROOT
$LOAD_PATH << ROOT unless $LOAD_PATH.include?(ROOT)
require 'asm/asm'
module Assembler
@ -15,8 +12,9 @@ module Assembler
# correct machine code, which isn't trivial.
class Text < AssemblerBase
def initialize(platform='linux')
def initialize(platform)
super
@vars = {} # Symbol table, maps names to locations in BSS.
@data = ''
@bss = ''
@code = ''
@ -39,6 +37,13 @@ module Assembler
end
end
def var(name)
@vars[name]
end
alias_method :var?, :var
# Emit a line of code wrapped between a tab and a newline.
def emit(code, options={})
tab = options.has_key?(:tab) ? options[:tab] : "\t"
@ -106,6 +111,10 @@ module Assembler
emit("call #{label}")
end
def leave
emit("leave")
end
def neg(reg)
emit("neg #{reg}")
end
@ -166,5 +175,9 @@ module Assembler
emit("int 0x#{num.to_s(16)}")
end
def cdq
emit("cdq")
end
end
end

View file

@ -3,10 +3,21 @@
require 'compiler'
require 'asm/text'
require 'asm/binary'
require 'asm/machosymtab'
require 'asm/machofile'
# usage: build.rb <filename> [elf | macho ] [asm | bin]
def main
filename = ARGV[0].to_s
raise "can't read #{filename}" unless File.readable?(filename)
binformat = ARGV[1] ? ARGV[1].downcase : 'elf'
format = ARGV[2] ? ARGV[2].downcase : 'asm'
platform = `uname -s`.chomp.downcase
puts "Building #{format} from #{filename} for #{platform}, binformat is #{binformat} ..."
outfile = build(filename, platform, format, binformat)
puts outfile
exit
end
@ -18,22 +29,17 @@ def base(filename)
end
# filename: input filename
# infile: input filename
# outfile: output filename
# asm: assembler to use
# returns: output filename
def compile(filename, asm)
def compile(infile, outfile, asm)
File.open(filename, 'r') do |input|
compiler = Compiler.new(input, asm)
compiler.compile
end
ext = asm.class.name.split('::').last[0,3].downcase == 'bin' ? 'bin' : 'asm'
outfile = "#{base(filename)}.#{ext}"
File.open(infile, 'r') do |input|
File.open(outfile, 'wb') do |out|
out.puts(asm.output)
compiler = Compiler.new(input, asm)
out.print(compiler.compile)
end
end
return outfile
rescue ParseError => e
error("[error] #{e.message}")
@ -44,12 +50,13 @@ rescue ParseError => e
end
# assemble using nasm, return resulting filename.
def asm(filename, binformat='elf')
def assemble(filename, binformat='elf')
f = base(filename)
outfile = "#{f}.o"
output = `nasm -f #{binformat} -g -o #{outfile} #{filename}`
output = `nasm -f #{binformat} -g -o #{outfile} #{filename} 2>&1`
if $?.exitstatus != 0
puts output
puts
print output
raise "nasm failed: #{$?.exitstatus}"
end
return outfile
@ -64,32 +71,41 @@ def link(filename, platform='linux')
else
raise "unsupported platform: #{platform}"
end
output = `#{cmd} #{args} -o #{f} #{filename}`
output = `#{cmd} #{args} -o #{f} #{filename} 2>&1`
if $?.exitstatus != 0
puts output
puts
print output
raise "ld failed: #{$?.exitstatus}"
end
`chmod +x #{f}`
`chmod u+x #{f}`
return f
end
# TODO Use a dependency injection framework for the assembler, and
# other parts as things become more modular.
def build(filename, platform='linux', format='asm', binformat='elf')
bin = if format == 'asm'
code = compile(filename, Assembler::Text.new(platform))
obj = asm( code, binformat )
link( obj, platform )
else # binary
obj = compile(filename, Assembler::Binary.new(platform))
link( obj, platform )
def build(filename, platform='linux', binformat='elf')
objfile = base(filename) + '.o'
symtab, objwriter =
case binformat
when 'elf': [Assembler::ELFSymtab.new, Assembler::ELFFile.new]
when 'macho': [Assembler::MachOSymtab.new, Assembler::MachOFile.new]
else
raise "unsupported binary format: #{binformat}"
end
return bin
compile(filename, objfile, Assembler::Binary.new(platform, symtab, objwriter))
exefile = link(objfile, platform)
return exefile
end
def build_asm(filename, platform='linux', binformat='elf')
asmfile = base(filename) + '.asm'
compile(filename, asmfile, Assembler::Text.new(platform))
objfile = assemble(asmfile, binformat)
exefile = link(objfile, platform)
return exefile
end
def run(filename)
filename = "./#{filename}" unless filename.include?('/')
system(filename)
`#{filename}`
return $?.exitstatus
end

View file

@ -12,6 +12,8 @@
# require 'rubygems'
# require 'unroller'
require 'asm/registers'
class ParseError < StandardError
attr_reader :caller, :context
def initialize(caller, context=nil)
@ -22,6 +24,8 @@ end
class Compiler
include Assembler::Registers
Keywords = %w[
if else end while until repeat for to do break
print
@ -82,7 +86,7 @@ class Compiler
asm.call(name)
else
# variable access
asm.mov(:eax, "dword [#{name}]")
asm.mov(EAX, [asm.var(name)])
end
end
@ -95,7 +99,7 @@ class Compiler
elsif alpha?(@look)
identifier # or call
elsif digit?(@look)
asm.mov(:eax, get_number.to_i)
asm.mov(EAX, get_number.to_i)
else
expected(:'integer, identifier, function call, or parenthesized expression', :got => @look)
end
@ -106,7 +110,7 @@ class Compiler
sign = @look
match(sign) if op?(:unary, sign)
factor
asm.neg(:eax) if sign == '-'
asm.neg(EAX) if sign == '-'
end
# Parse and translate a single term (factor or mulop). Result is in
@ -115,14 +119,13 @@ class Compiler
signed_factor # Result in eax.
while op?(:mul, @look)
pushing(:eax) do
asm.push(EAX)
case @look
when '*': multiply
when '/': divide
end
end
end
end
# Parse and translate a general expression of terms. Result is
# in eax.
@ -130,21 +133,21 @@ class Compiler
term # Result is in eax.
while op_char?(@look, :add)
pushing(:eax) do
asm.push(EAX)
case @look
when '+': add
when '-': subtract
end
end
end
end
# Parse an addition operator and the 2nd term (b). The result is
# left in eax. The 1st term (a) is expected on the stack.
def add
match('+')
term # Result is in eax.
asm.add(:eax, '[esp]') # Add a to b.
asm.pop(EBX)
asm.add(EAX, EBX) # Add a to b.
end
# Parse a subtraction operator and the 2nd term (b). The result is
@ -152,8 +155,9 @@ class Compiler
def subtract
match('-')
term # Result, b, is in eax.
asm.neg(:eax) # Fake the subtraction. a - b == a + -b
asm.add(:eax, '[esp]') # Add a and -b.
asm.pop(EBX)
asm.neg(EAX) # Fake the subtraction. a - b == a + -b
asm.add(EAX, EBX) # Add a(ebx) to -b(eax).
end
# Parse a multiplication operator and the 2nd factor (b). The result is
@ -161,7 +165,8 @@ class Compiler
def multiply
match('*')
signed_factor # Result is in eax.
asm.imul('dword [esp]') # Multiply a by b.
asm.pop(EBX)
asm.imul(EBX) # Multiply a by b.
end
# Parse a division operator and the divisor (b). The result is
@ -169,14 +174,15 @@ class Compiler
def divide
match('/')
signed_factor # Result is in eax.
asm.xchg(:eax, '[esp]') # Swap the divisor and dividend into
asm.pop(EBX)
asm.xchg(EAX, EBX) # Swap the divisor and dividend into
# the correct places.
# idiv uses edx:eax as the dividend so we need to ensure that edx
# is correctly sign-extended w.r.t. eax.
asm.cdq # Sign-extend eax into edx (Convert Double to
# Quad).
asm.idiv('dword [esp]') # Divide a (eax) by b ([esp]).
asm.idiv(EBX) # Divide a (eax) by b (ebx).
end
@ -187,19 +193,22 @@ class Compiler
def bitor_expr
match('|')
term
asm.or(:eax, '[esp]')
asm.pop(EBX)
asm.or_(EAX, EBX)
end
def bitand_expr
match('&')
signed_factor
asm.and_(:eax, '[esp]')
asm.pop(EBX)
asm.and_(EAX, EBX)
end
def xor_expr
match('^')
term
asm.xor(:eax, '[esp]')
asm.pop(EBX)
asm.xor(EAX, EBX)
end
@ -232,9 +241,9 @@ class Compiler
def boolean_factor
if boolean?(@look)
if get_boolean == 'true'
asm.mov(:eax, -1)
asm.mov(EAX, -1)
else
asm.xor(:eax, :eax)
asm.xor(EAX, EAX)
end
scan
else
@ -246,8 +255,8 @@ class Compiler
if @look == '!'
match('!')
boolean_factor
make_boolean(:eax) # ensure it is -1 or 0...
asm.not(:eax) # so that not is also boolean not
make_boolean(EAX) # ensure it is -1 or 0...
asm.not_(EAX) # so that 1's complement NOT is also boolean not
else
boolean_factor
end
@ -255,8 +264,8 @@ class Compiler
# Convert any identifier to a boolean (-1 or 0). This is
# semantically equivalent to !!reg in C or Ruby.
def make_boolean(reg=:eax)
end_label = asm.label(:endmakebool)
def make_boolean(reg=EAX)
end_label = asm.mklabel(:endmakebool)
asm.cmp(reg, 0) # if false do nothing
asm.jz(end_label)
asm.mov(reg, -1) # truthy, make it true
@ -267,7 +276,7 @@ class Compiler
expression
if op_char?(@look, :rel)
scan
pushing(:eax) do
asm.push(EAX)
case @value
when '==': eq_relation
when '!=': neq_relation
@ -278,9 +287,8 @@ class Compiler
end
end
end
end
# a: [esp]
# a: <on the stack>
# b: eax
#
# If b - a is zero then a = b, and make_boolean will leave the zero
@ -288,14 +296,15 @@ class Compiler
# and make_boolean will leave -1 (true) for us in eax.
def neq_relation
expression
asm.sub(:eax, '[esp]')
asm.pop(EBX)
asm.sub(EAX, EBX)
make_boolean
end
# Invert the != test for equal.
def eq_relation
neq_relation
asm.not(:eax)
asm.not_(EAX)
end
# > and < are both implemented in terms of jl (jump if less than).
@ -303,6 +312,12 @@ class Compiler
# and order the terms appropriately for each function. As for >=
# and <=, they in turn are implemented in terms of > and <. a is
# greater than or equal to b if and only if a is *not* less than b.
#
# Note: This was done to minimize the number of instructions that
# the assembler needed to implement, but since the Jcc
# instructions are very cheap to implement this is no longer
# a concern.
# The next 4 relations all compare 2 values a and b, then return
# true (-1) if the difference was below zero and false (0)
@ -311,58 +326,62 @@ class Compiler
# Invert the sense of the test?
invert = options[:invert]
true_label = asm.label(:cmp)
end_label = asm.label(:endcmp)
true_label = asm.mklabel(:cmp)
end_label = asm.mklabel(:endcmp)
asm.cmp(a, b)
asm.jl(true_label)
asm.xor(:eax, :eax) # return false
asm.not(:eax) if invert # (or true if inverted)
asm.xor(EAX, EAX) # return false
asm.not_(EAX) if invert # (or true if inverted)
asm.jmp(end_label)
asm.emit_label(true_label)
asm.xor(:eax, :eax) # return true
asm.not(:eax) unless invert # (or false if inverted)
asm.xor(EAX, EAX) # return true
asm.not_(EAX) unless invert # (or false if inverted)
asm.emit_label(end_label)
end
# a: [esp]
# a: <on the stack>
# b: eax
#
# if a > b then b - a < 0
def gt_relation
expression
cmp_relation(:eax, '[esp]') # b - a
asm.pop(EBX)
cmp_relation(EAX, EBX) # b - a
end
# a: [esp]
# a: <on the stack>
# b: eax
#
# if a < b then a - b < 0
def lt_relation
expression
cmp_relation('[esp]', :eax) # a - b
asm.pop(EBX)
cmp_relation(EBX, EAX) # a - b
end
# a: [esp]
# a: <on the stack>
# b: eax
#
# if a >= b then !(a < b)
def ge_relation
expression
asm.pop(EBX)
# Compare them as in less than but invert the result.
cmp_relation('[esp]', :eax, :invert => true)
cmp_relation(EBX, EAX, :invert => true)
end
# a: [esp]
# a: <on the stack>
# b: eax
#
# if a <= b then !(a > b)
def le_relation
expression
asm.pop(EBX)
# Compare them as in greater than but invert the result.
cmp_relation(:eax, '[esp]', :invert => true)
cmp_relation(EAX, EBX, :invert => true)
end
@ -376,7 +395,7 @@ class Compiler
match('=')
boolean_expression
asm.defvar(name) unless asm.var?(name)
asm.mov("dword [#{name}]", :eax)
asm.mov([asm.var(name)], EAX)
end
# Parse a code block.
@ -413,7 +432,7 @@ class Compiler
# Parse an if-else statement.
def if_else_stmt(label)
else_label = asm.label(:end_or_else)
else_label = asm.mklabel(:end_or_else)
end_label = else_label # only generated if else clause
# present
condition
@ -424,7 +443,7 @@ class Compiler
@indent -= 1
if @token == :keyword && @value == 'else'
skip_any_whitespace
end_label = asm.label(:endif) # now we need the 2nd label
end_label = asm.mklabel(:endif) # now we need the 2nd label
asm.jmp(end_label)
asm.emit_label(else_label)
@indent += 1
@ -441,8 +460,8 @@ class Compiler
# block: Code to execute at the start of each iteration. (e.g. a
# condition)
def simple_loop(name)
start_label = asm.label(:"loop_#{name}")
end_label = asm.label(:"end_#{name}")
start_label = asm.mklabel(:"#{name}_loop")
end_label = asm.mklabel(:"end_#{name}")
asm.emit_label(start_label)
yield(end_label)
@ -482,27 +501,29 @@ class Compiler
# s = s + x
# e
def for_stmt
counter = "[#{get_name}]"
counter = get_name
asm.defvar(counter)
match('=')
boolean_expression # initial value
asm.sub(:eax, 1) # pre-decrement because of the
asm.sub(EAX, 1) # pre-decrement because of the
# following pre-increment
asm.mov(counter, :eax) # stash the counter in memory
asm.mov([asm.var(counter)], EAX) # stash the counter in memory
match_word('to', :scan => true)
boolean_expression # final value
skip_any_whitespace
asm.push(:eax) # stash final value on stack
final = '[esp]'
asm.push(EAX) # stash final value on stack
asm.mov(EDX, ESP)
final = [EDX]
simple_loop('for') do |end_label|
asm.mov(:ecx, counter) # get the counter
asm.add(:ecx, 1) # increment
asm.mov(counter, :ecx) # store the counter
asm.cmp(final, :ecx) # check if we're done
asm.mov(ECX, [asm.var(counter)]) # get the counter
asm.add(ECX, 1) # increment
asm.mov([asm.var(counter)], ECX) # store the counter
asm.cmp(final, ECX) # check if we're done
asm.jz(end_label) # if so jump to the end
end
asm.add(:esp, 4) # clean up the stack
asm.add(ESP, 4) # clean up the stack
end
# do 5
@ -512,19 +533,19 @@ class Compiler
boolean_expression
skip_any_whitespace
asm.mov(:ecx, :eax)
asm.mov(ECX, EAX)
start_label = asm.label(:do)
end_label = asm.label(:enddo)
start_label = asm.mklabel(:do)
end_label = asm.mklabel(:enddo)
asm.emit_label(start_label)
asm.push(:ecx)
asm.push(ECX)
@indent += 1
block(end_label)
@indent -= 1
asm.pop(:ecx)
asm.pop(ECX)
match_word('end')
asm.loop_(start_label)
@ -532,13 +553,13 @@ class Compiler
# Phony push! break needs to clean up the stack, but since we
# don't know if there is a break at this point we fake a push and
# always clean up the stack after.
asm.sub(:esp, 4)
asm.sub(ESP, 4)
asm.emit_label(end_label)
# If there was a break we have to clean up the stack here. If
# there was no break we clean up the phony push above.
asm.add(:esp, 4)
asm.add(ESP, 4)
end
def break_stmt(label)
@ -554,79 +575,83 @@ class Compiler
def condition
boolean_expression
skip_whitespace
asm.cmp(:eax, 0) # 0 is false, anything else is true
asm.cmp(EAX, 0) # 0 is false, anything else is true
end
# print eax in hex format
def print_stmt
# variable names
d = 'DIGITS'
h = 'HEX'
asm.block do
# define a lookup table of digits
unless var?('DIGITS')
defvar('DIGITS', 4)
mov('dword [DIGITS]', 0x33323130)
mov('dword [DIGITS+4]', 0x37363534)
mov('dword [DIGITS+8]', 0x62613938)
mov('dword [DIGITS+12]', 0x66656463)
unless var?(d)
defvar(d, 4)
mov([var(d)], 0x33323130)
mov([var(d)+4], 0x37363534)
mov([var(d)+8], 0x62613938)
mov([var(d)+12], 0x66656463)
end
# 3 dwords == 12 chars
defvar('HEX', 3) unless var?('HEX')
defvar(h, 3) unless var?(h)
# TODO check sign and prepend '-' if negative
mov('word [HEX]', 0x7830) # "0x" == [48, 120]
mov('word [HEX+10]', 0xa) # newline + null terminator
mov([var(h)], 0x7830) # "0x" == [48, 120]
mov([var(h)+10], 0xa) # newline + null terminator
end
boolean_expression
asm.block do
# convert eax to a hex string
lea(:esi, '[DIGITS]')
lea(:edi, '[HEX+9]')
lea(ESI, [var(d)])
lea(EDI, [var(h)+9])
# build the string backwards (right to left), byte by byte
mov(:ecx, 4)
mov(ECX, 4)
end
asm.emit_label(loop_label=asm.label)
asm.emit_label(loop_label=asm.mklabel)
asm.block do
# low nybble of nth byte
movzx(:ebx, :al)
and_(:bl, 0x0f) # isolate low nybble
movzx(:edx, 'byte [esi+ebx]')
mov('byte [edi]', :dl)
dec(:edi)
movzx(EBX, AL)
and_(BL, 0x0f) # isolate low nybble
movzx(EDX, [:byte, ESI+EBX])
mov([EDI], DL)
dec(EDI)
# high nybble of nth byte
movzx(:ebx, :al)
and_(:bl, 0xf0) # isolate high nybble
shr(:bl, 4)
mov(:dl, 'byte [esi+ebx]')
mov('byte [edi]', :dl)
dec(:edi)
shr(:eax, 8)
movzx(EBX, AL)
and_(BL, 0xf0) # isolate high nybble
shr(BL, 4)
mov(DL, [ESI+EBX])
mov([EDI], DL)
dec(EDI)
shr(EAX, 8)
loop_(loop_label)
# write(int fd, char *s, int n)
mov(:eax, 4) # SYS_write
lea(:ecx, '[HEX]') # ecx = &s
mov(EAX, 4) # SYS_write
lea(ECX, [var(h)]) # ecx = &s
args = [1, # fd = 1 (STDOUT)
:ecx, # s = &s
ECX, # s = &s
11] # n = 11 (excluding term, max # of chars to print)
case platform
when 'darwin' # on the stack, right to left (right @ highest addr)
####
# setup bogus stack frame
push(:ebp)
mov(:ebp, :esp)
sub(:esp, 36)
push(EBP)
mov(EBP, ESP)
sub(ESP, 36)
####
args.reverse.each { |a| push(a) }
push(:eax)
push(EAX)
int(0x80)
####
# teardown bogus stack frame
xor(:eax, :eax)
add(:esp, 36)
pop(:ebx)
emit("leave")
xor(EAX, EAX)
add(ESP, 36)
pop(EBX)
leave
####
when 'linux'
mov(:ebx, args[0])
mov(:ecx, args[1])
mov(:edx, args[2])
mov(EBX, args[0])
mov(ECX, args[1])
mov(EDX, args[2])
int(0x80)
end
end
@ -819,15 +844,15 @@ class Compiler
def pushing(reg)
asm.push(reg)
yield
asm.add(:esp, 4)
asm.add(ESP, 4)
end
def op(name)
pushing(:eax) do
asm.push(EAX)
get_op
expected(name) unless match_word(name)
yield
end
asm.add(ESP, 4)
end

View file

@ -1,288 +0,0 @@
#include <libelf.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
/* _exit(0) */
/* uint8_t shell_code[] = { */
/* 0xbb, 0, 0, 0, 0, /\* mov ebx, 0 *\/ */
/* 0xb8, 1, 0, 0, 0, /\* mov eax, 1 *\/ */
/* 0xcd, 0x80 /\* int 0x80 *\/ */
/* }; */
/* uint32_t hash_words[] = { */
/* 0x12345678, */
/* 0xdeadc0de, */
/* 0x1234abcd */
/* }; */
#define header_size 0x100
#define text_addr 0x8048000 + header_size
#define text_size 0x02be00
#define data_addr text_addr + text_size
#define data_size 0x4e00
#define bss_addr data_addr + data_size
size_t bss_size = 0;
char string_table[] = {
/* Offset 0 */ '\0',
/* Offset 1 */ '.', 't', 'e', 'x', 't', '\0' ,
/* Offset 7 */ '.', 'b', 's', 's', '\0',
/* Offset 12 */ '.', 's', 'h', 's', 't', 'r', 't', 'a', 'b', '\0'
};
/* Write a static 32-bit x86 ELF binary to filename. The file is
 * clobbered without confirmation!
 *
 * filename:  path of the output file; created or truncated, mode 0666.
 * code:      bytes used as the contents of the .text section.
 * code_size: number of bytes in code.
 *
 * The size of the .bss section is taken from the global bss_size.
 *
 * Returns 0 on success and a negative code on failure, after printing a
 * message with printf. Codes -6, -7, -8 and -12 are each returned from
 * more than one place, so they do not identify a unique failure.
 *
 * NOTE(review): every error return after open() succeeds leaves fd open,
 * and those after elf_begin() succeeds also skip elf_end().
 */
int
elf_write(const char *filename, uint8_t *code, size_t code_size)
{
int fd;
size_t shstrndx;
Elf *elf;
Elf_Scn *scn;
Elf_Data *data;
Elf32_Ehdr *ehdr;
Elf32_Phdr *phdr;
Elf32_Shdr *shdr;
/* libelf must be told which ELF version to work with before any other call. */
if (elf_version(EV_CURRENT) == EV_NONE) {
printf("Failed to initialize ELF library!\n");
return -1;
}
/* O_TRUNC: an existing file is overwritten. */
if ((fd = open(filename, O_RDWR|O_TRUNC|O_CREAT, 0666)) < 0) {
printf("Can't open %s for writing.\n", filename);
perror("[elf_write]");
return -2;
}
if ((elf = elf_begin(fd, ELF_C_WRITE, (Elf *)0)) == 0) {
printf("elf_begin failed!\n");
return -3;
}
/**************
* ELF Header *
**************/
if ((ehdr = elf32_newehdr(elf)) == NULL) {
printf("elf32_newehdr failed!\n");
return -4;
}
ehdr->e_ident[EI_DATA] = ELFDATA2LSB; /* 2's complement, little endian */
ehdr->e_type = ET_EXEC;
ehdr->e_machine = EM_386; /* x86 */
/* Image starts at 0x8048000, x86 32-bit abi. We need a bit
* of room for headers and such. TODO figure out how much
* room is needed!
*
* Current entry point is .text section.
*/
ehdr->e_entry = text_addr;
/*******************
* Program Headers *
*******************/
/* 3 segments => 3 program headers (text, data, bss) */
/* The headers are only allocated here; their fields are filled in near
 * the end of this function, after the first elf_update() call. */
if ((phdr = elf32_newphdr(elf, 3)) == NULL) {
printf("elf32_newphdr failed!\n");
return -5;
}
/*****************
* .text section *
*****************/
if ((scn = elf_newscn(elf)) == NULL) {
printf("elf_newscn failed!\n");
return -6;
}
if ((data = elf_newdata(scn)) == NULL) {
printf("elf_newdata failed!\n");
return -7;
}
/* The section's contents are the caller's buffer; it is not copied here. */
data->d_align = 16;
data->d_buf = code;
data->d_off = 0LL;
data->d_type = ELF_T_BYTE;
data->d_size = code_size;
data->d_version = EV_CURRENT;
if ((shdr = elf32_getshdr(scn)) == NULL) {
printf("elf32_getshdr failed!\n");
return -8;
}
shdr->sh_name = 1; /* offset of ".text" in string_table */
shdr->sh_type = SHT_PROGBITS;
shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
shdr->sh_addr = text_addr;
/****************
* .bss section *
****************/
/* Same error codes (-6, -7, -8) as the .text section above. */
if ((scn = elf_newscn(elf)) == NULL) {
printf("elf_newscn failed!\n");
return -6;
}
if ((data = elf_newdata(scn)) == NULL) {
printf("elf_newdata failed!\n");
return -7;
}
/* No d_buf is assigned: the section is SHT_NOBITS and only has a size. */
data->d_align = 4;
data->d_off = 0LL;
data->d_type = ELF_T_BYTE;
data->d_size = bss_size;
data->d_version = EV_CURRENT;
if ((shdr = elf32_getshdr(scn)) == NULL) {
printf("elf32_getshdr failed!\n");
return -8;
}
shdr->sh_name = 7; /* offset of ".bss" in string_table */
shdr->sh_type = SHT_NOBITS;
shdr->sh_flags = SHF_WRITE | SHF_ALLOC;
shdr->sh_addr = bss_addr;
/*******************************
* section header string table *
*******************************/
if ((scn = elf_newscn(elf)) == NULL) {
printf("elf_newscn failed!\n");
return -9;
}
if ((data = elf_newdata(scn)) == NULL) {
printf("elf_newdata failed!\n");
return -10;
}
data->d_align = 1;
data->d_buf = string_table;
data->d_off = 0LL;
data->d_type = ELF_T_BYTE;
data->d_size = sizeof(string_table);
data->d_version = EV_CURRENT;
if ((shdr = elf32_getshdr(scn)) == NULL) {
printf("elf32_getshdr failed!\n");
return -11;
}
shdr->sh_name = 12; /* offset of ".shstrtab" in string_table */
shdr->sh_type = SHT_STRTAB;
shdr->sh_flags = SHF_STRINGS | SHF_ALLOC;
shdr->sh_entsize = 0;
/* int elf_setshstrndx(Elf *e, Elf32_Ehdr *eh, size_t shstrndx) */
/* Record the string table's section index in the ELF header. An index
 * of SHN_LORESERVE or above is replaced by SHN_XINDEX; the assignment
 * of the real index to section 0's sh_link is commented out below. */
shstrndx = elf_ndxscn(scn);
if (shstrndx >= SHN_LORESERVE) {
if ((scn = elf_getscn(elf, 0)) == NULL) {
printf("elf_getscn failed!\n");
return -12;
}
/* assert(scn->s_ndx == SHN_UNDEF); */
/* scn->s_shdr.s_shdr32.sh_link = shstrndx; */
elf_flagshdr(scn, ELF_C_SET, ELF_F_DIRTY);
shstrndx = SHN_XINDEX;
}
ehdr->e_shstrndx = shstrndx;
/* ELF_C_NULL: update without writing the file. Presumably done so the
 * layout is computed before the program headers are filled in -- TODO
 * confirm. Shares error code -12 with the elf_getscn failure above. */
if (elf_update(elf, ELF_C_NULL) < 0) {
printf("elf_update failed!\n");
return -12;
}
/* phdr->p_vaddr = phdr->p_paddr = 0x8048000 + ehdr->e_phoff; */
/* phdr->p_type = PT_PHDR; */
/* phdr->p_offset = ehdr->e_phoff; */
/* phdr->p_filesz = elf32_fsize(ELF_T_PHDR, 1, EV_CURRENT); */
/* text segment */
/* Sized by the fixed text_size macro, not by code_size. */
phdr->p_vaddr = text_addr;
phdr->p_type = PT_LOAD;
phdr->p_offset = header_size;
phdr->p_filesz = text_size;
phdr->p_memsz = text_size;
phdr->p_flags = PF_R | PF_X;
phdr->p_align = 0x1000;
/* data segment */
phdr++;
phdr->p_vaddr = data_addr;
phdr->p_type = PT_LOAD;
phdr->p_offset = header_size + text_size;
phdr->p_filesz = data_size;
phdr->p_memsz = data_size + 0x1024; /* XXX unsure why the abi specifies + 0x1024 */
phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_align = 0x1000;
/* bss segment */
phdr++;
phdr->p_vaddr = bss_addr;
phdr->p_type = PT_LOAD;
phdr->p_offset = header_size + text_size + data_size;
phdr->p_filesz = bss_size;
phdr->p_memsz = bss_size;
phdr->p_flags = PF_R | PF_W;
phdr->p_align = 0x1000;
/* Mark the program headers dirty so the edits above are written out. */
elf_flagphdr(elf, ELF_C_SET, ELF_F_DIRTY);
if (elf_update(elf, ELF_C_WRITE) < 0) {
printf("elf_update failed!\n");
return -13;
}
elf_end(elf);
close(fd);
return 0;
}
/*
 * Command-line entry point: wraps a raw code image in an ELF binary.
 *
 * usage: <program> <input> <bss_size> <output>
 *
 * Reads all of <input> into memory, stores <bss_size> (parsed as a decimal
 * number) in the global bss_size, and hands the bytes to elf_write() with
 * <output> as the destination.
 *
 * Returns 0 on success, 1 on bad usage, 2 if the input cannot be opened or
 * read (or memory runs out while reading it), and 3 if elf_write() fails.
 */
int
main(int argc, const char *argv[])
{
    int result;
    FILE *in;
    uint8_t *code = NULL;
    size_t code_size = 0;
    const size_t chunk_size = 1024;

    if (argc < 4) {
        printf("usage: %s <input> <bss_size> <output>\n", argv[0]);
        printf(" Wraps the input file in an ELF binary.\n");
        return 1;
    }

    bss_size = strtoul(argv[2], 0, 10);

    /* fopen() signals failure with NULL, never with a negative value.
     * "rb" because the input is raw machine code, not text. */
    if ((in = fopen(argv[1], "rb")) == NULL) {
        printf("[error] can't open %s for reading.\n", argv[1]);
        perror("[main]");
        return 2;
    }

    /* Grow the buffer one chunk at a time until a short read marks EOF
     * (or an error, which is distinguished below). */
    for (;;) {
        size_t bytes_read;
        uint8_t *grown = realloc(code, code_size + chunk_size);

        if (grown == NULL) {
            /* realloc leaves the old buffer intact on failure. */
            printf("[error] out of memory while reading %s.\n", argv[1]);
            free(code);
            fclose(in);
            return 2;
        }
        code = grown;

        bytes_read = fread(code + code_size, 1, chunk_size, in);
        code_size += bytes_read;
        if (bytes_read < chunk_size) {
            break;
        }
    }

    if (ferror(in)) {
        printf("[error] failed while reading %s.\n", argv[1]);
        perror("[main]");
        free(code);
        fclose(in);
        return 2;
    }
    fclose(in);

    printf("Writing x86 ELF binary to %s...\n", argv[3]);
    result = elf_write(argv[3], code, code_size);
    free(code);

    if (result < 0) {
        printf("[error] elf_write failed.\n");
        return 3;
    }
    return 0;
}

12
lea.asm
View file

@ -1,12 +0,0 @@
;;; Scratch input: lea with register-indirect, base+index, base+scaled-index
;;; and absolute-address operands, assembled as 32-bit code.
BITS 32
lea eax, [ebx+ecx*4]
lea ebx, [eax+ecx*4]
lea eax, [ecx+ebx*4]
lea eax, [ecx+ebx*8]
lea eax, [ecx+ebx]
lea eax, [0x1000+10*4]
lea eax, [eax]
lea eax, [ecx]
lea ecx, [eax]
lea eax, [0xdeadbeef]

89
mov.asm
View file

@ -1,89 +0,0 @@
BITS 32
;;; 00000000 b8 78 56 34 12 b9 78 56 34 12 ba 78 56 34 12 bb |.xV4..xV4..xV4..|
;;; 00000010 78 56 34 12 89 c0 89 c8 89 d0 89 d8 89 c1 89 c9 |xV4.............|
;;; 00000020 89 d1 89 d9 89 c2 89 ca 89 d2 89 da 89 c3 89 cb |................|
;;; 00000030 89 d3 89 db a1 ef be ad de 8b 0d ef be ad de 8b |................|
;;; 00000040 15 ef be ad de 8b 1d ef be ad de a3 ef be ad de |................|
;;; 00000050 89 0d ef be ad de 89 15 ef be ad de 89 1d ef be |................|
;;; 00000060 ad de 8b 00 8b 01 8b 02 8b 03 8b 08 8b 09 8b 0a |................|
;;; 00000070 8b 0b 8b 10 8b 11 8b 12 8b 13 8b 18 8b 19 8b 1a |................|
;;; 00000080 8b 1b 89 00 89 01 89 02 89 03 89 08 89 09 89 0a |................|
;;; 00000090 89 0b 89 10 89 11 89 12 89 13 89 18 89 19 89 1a |................|
;;; 000000a0 89 1b |..|
;;; 000000a2
mov eax, 0x12345678 ; b8 78 56 34 12
mov ecx, 0x12345678 ; b9 78 56 34 12
mov edx, 0x12345678 ; ba 78 56 34 12
mov ebx, 0x12345678 ; bb 78 56 34 12
mov eax, eax ; 89 c0
mov eax, ecx ; 89 c8
mov eax, edx ; 89 d0
mov eax, ebx ; 89 d8
mov ecx, eax ; 89 c1
mov ecx, ecx ; 89 c9
mov ecx, edx ; 89 d1
mov ecx, ebx ; 89 d9
mov edx, eax ; 89 c2
mov edx, ecx ; 89 ca
mov edx, edx ; 89 d2
mov edx, ebx ; 89 da
mov ebx, eax ; 89 c3
mov ebx, ecx ; 89 cb
mov ebx, edx ; 89 d3
mov ebx, ebx ; 89 db
mov eax, dword [0xdeadbeef] ; a1 ef be ad de
mov ecx, dword [0xdeadbeef] ; 8b 0d ef be ad de
mov edx, dword [0xdeadbeef] ; 8b 15 ef be ad de
mov ebx, dword [0xdeadbeef] ; 8b 1d ef be ad de
mov [0xdeadbeef], eax ; a3 ef be ad de
mov [0xdeadbeef], ecx ; 89 0d ef be ad de
mov [0xdeadbeef], edx ; 89 15 ef be ad de
mov [0xdeadbeef], ebx ; 89 1d ef be ad de
mov eax, dword [eax] ; 8b 00
mov eax, dword [ecx] ; 8b 01
mov eax, dword [edx] ; 8b 02
mov eax, dword [ebx] ; 8b 03
mov ecx, dword [eax] ; 8b 08
mov ecx, dword [ecx] ; 8b 09
mov ecx, dword [edx] ; 8b 0a
mov ecx, dword [ebx] ; 8b 0b
mov edx, dword [eax] ; 8b 10
mov edx, dword [ecx] ; 8b 11
mov edx, dword [edx] ; 8b 12
mov edx, dword [ebx] ; 8b 13
mov ebx, dword [eax] ; 8b 18
mov ebx, dword [ecx] ; 8b 19
mov ebx, dword [edx] ; 8b 1a
mov ebx, dword [ebx] ; 8b 1b
mov [eax], eax ; 89 00
mov [ecx], eax ; 89 01
mov [edx], eax ; 89 02
mov [ebx], eax ; 89 03
mov [eax], ecx ; 89 08
mov [ecx], ecx ; 89 09
mov [edx], ecx ; 89 0a
mov [ebx], ecx ; 89 0b
mov [eax], edx ; 89 10
mov [ecx], edx ; 89 11
mov [edx], edx ; 89 12
mov [ebx], edx ; 89 13
mov [eax], ebx ; 89 18
mov [ecx], ebx ; 89 19
mov [edx], ebx ; 89 1a
mov [ebx], ebx ; 89 1b

View file

@ -56,7 +56,7 @@ break: test.rb test_break.code
print: test.rb test_print.code
@./test.rb print $(BINFORMAT)
big_test: test.rb big_test.code
big_test: test.rb test_big.code
@./test.rb big $(BINFORMAT)
clean:

View file

@ -5,20 +5,20 @@ $LOAD_PATH << ROOT
require 'build'
# usage: build.rb <func> [binformat]
#
# ([format] will go before [binformat])
# usage: test.rb <func> [binformat] [format]
def main
func = ARGV[0].to_s
format = 'asm' # 'bin' only assembles one or two
# instructions right now, but support
# is in place
binformat = (ARGV[1] ? ARGV[1] : 'elf').downcase
binformat = ARGV[1] ? ARGV[1].downcase : 'elf'
format = ARGV[2] ? ARGV[2].downcase : 'asm'
platform = `uname -s`.chomp.downcase
print "testing #{func} ... "
success = run( build("test_#{func}.code", platform, format, binformat) )
puts success == 0 ? "pass" : "FAIL! (#{success})"
success = run( build("test_#{func}.code", platform, binformat) )
if success == 0
puts "pass"
else
puts "FAIL! (#{success})"
end
exit(success.to_i)
end

View file

@ -1,7 +1,5 @@
i=0
a=10
for i = 0 to 10
a=a-1
end
a=a

11
x86.txt
View file

@ -1,11 +0,0 @@
mov (0x66) {
reg32, reg32 (0x89) {
op2 - src
eax ecx edx ebx
op1 eax c0 c8 d0 d8
dest ecx c1 c9 d1 d9
edx c2 ca d2 da
ebx c3 cb d3 db
}
}