# compiler/compiler.rb

# A compiler as described by Jack Crenshaw in his famous book "Let's
# Build a Compiler".  At least in the beginning, this code will
# closely reflect the Pascal code written by Jack.  Over time it may
# become more idiomatic, however this is an academic exercise.
#
# sjs
# may 2009

require 'opcode'
#require 'assembler'

# Raised when the parser meets input it cannot accept.
#
# +caller+ holds whatever the raiser passed as the first constructor
# argument and +context+ an optional second value (nil when omitted).
# The human-readable message is supplied separately through +raise+.
class ParseError < StandardError
  attr_reader :caller, :context

  def initialize(caller, context = nil)
    @caller, @context = caller, context
  end
end

class Compiler
#  include Assembler

  attr_reader :data, :bss, :code

  # Build a compiler that reads program text from +input+ (STDIN by
  # default; the object must answer eof?, readbyte and readline), start
  # with empty output sections and symbol tables, and load the first
  # lookahead character.
  def initialize(input = STDIN)
    # Lexer state.
    @input = input               # Stream to read from.
    @look = ''                   # Next lookahead char.

    # Textual assembly output, one string per section.
    @data = ''
    @bss = ''
    @code = ''

    # Machine code output.
    @binary = []                 # Byte array of machine code.
    @machine_code = ''           # Byte string of machine code.

    # Bookkeeping for symbols, labels and loops.
    @vars = {}                   # Symbol table, maps names to locations in BSS.
    @num_labels = 0              # Used to generate unique labels.
    @num_labels_with_suffix = Hash.new(0)
    @break_stack = []            # Labels to jump to when breaking out of loops.

    # Memory layout (ELF, Linux, x86).  Each section starts where the
    # previous one ends.
    @header_size = 0x100
    @text_offset = 0x08048000 + @header_size
    @text_size = 0x02be00
    @data_offset = @text_offset + @text_size
    @data_size = 0x4e00
    @bss_offset = @data_offset + @data_size
    @bss_size = 0

    # Labels for the assembler.  Maps names to locations; looking up an
    # unknown label raises.
    @labels = Hash.new { |_table, key| raise "undefined label: #{key}" }

    # Dispatch table for keywords.  A nil handler marks a character that
    # is reserved but only ever consumed by another statement's parser.
    @dispatch = {
      'b' => method(:break_stmt),        # break
      'e' => nil,                        # end
      'l' => nil,                        # else
      'i' => method(:if_else_stmt),      # if-else
      'r' => method(:repeat_stmt),       # repeat
      'u' => method(:until_stmt),        # until
      'w' => method(:while_stmt)         # while
    }

    # Reserved words are exactly the dispatch keys.
    @keywords = @dispatch.keys

    # Seed the lexer.
    get_char
  end

  # Compile the whole input: parse one block, insist that nothing but
  # end of file follows, pack the machine code, and return
  # [data, bss, code, machine_code].
  def parse
    block
    eof? || expected(:'end of file')
    compile
    return @data, @bss, @code, @machine_code
  end


  # Parse and translate an identifier or function call.
  def identifier
    name = get_name

    # Anything not followed by '(' is a plain variable read into eax.
    return x86_mov(:eax, "dword [#{name}]") unless @look == '('

    # Function call; only an empty argument list is accepted.
    match('(')
    match(')')
    x86_call(name)
  end

  # Parse and translate a single factor.  Result is in eax.
  def factor
    # Parenthesized sub-expression.
    if @look == '('
      match('(')
      expression
      return match(')')
    end

    return identifier if alpha?(@look)
    return x86_mov(:eax, get_num) if digit?(@look)

    expected(:'integer, identifier, or parenthesized expression')
  end

  # Parse and translate a single term (factor or mulop).  Result is in
  # eax.
  def term
    factor                      # Leaves the first factor in eax.

    while mulop?
      # multiply and divide expect the left operand on the stack and
      # leave their result in eax, so every interim result is pushed
      # here before the next operator is handled.
      x86_push(:eax)
      @look == '*' ? multiply : divide

      # Drop the saved left operand again.
      x86_add(:esp, 4)
    end
  end

  # Parse and translate a general expression of terms.  Result is
  # in eax.
  def expression
    # A leading + or - is treated as "0 <op> term": zero eax and let the
    # loop below handle the operator.  Otherwise parse the first term.
    addop? ? x86_xor(:eax, :eax) : term

    while addop?
      # add and subtract expect the left operand on the stack and leave
      # their result in eax, so every interim result is pushed here.
      x86_push(:eax)
      @look == '+' ? add : subtract

      # Drop the saved left operand again.
      x86_add(:esp, 4)
    end
  end

  # Parse an assignment statement.  Value is in eax.
  def assignment
    target = get_name
    match('=')
    expression                  # Value ends up in eax.

    # The variable is declared only after its right-hand side has been
    # parsed, then the value is stored into it.
    var(target)
    x86_mov("dword [#{target}]", :eax)
  end

  # Parse a statement.
  def statement
    # Keyword statements are selected by the lookahead character.
    handler = @dispatch[@look]
    return handler.call if handler

    # Everything else must be an assignment terminated by a newline.
    assignment
    newline
  end

  # Parse a code block.
  def block(label=nil)
    # While this block is being parsed, +label+ (if given) is the target
    # of any break statement inside it.
    @break_stack.push(label) if label

    # A block runs until an else ('l'), an end ('e') or end of input.
    until %w[l e].include?(@look) || eof?
      statement
      skip_any_whitespace
    end

    @break_stack.pop if label
  end

  # Parse an if-else statement.
  def if_else_stmt
    match('i')
    condition
    skip_any_whitespace

    # Jump past the "then" block when the condition is false.
    false_label = unique_label(:end_or_else)
    x86_jz(false_label)
    block

    # Without an else clause the false target doubles as the end label.
    done_label = false_label
    if @look == 'l'
      match('l')
      skip_any_whitespace
      # An else clause needs a second label so the "then" block can
      # jump over it.
      done_label = unique_label(:endif)
      x86_jmp(done_label)
      emit_label(false_label)
      block
    end

    match('e')
    emit_label(done_label)
  end

  # Parse a while loop: the condition is tested at the top and the body
  # repeats for as long as it is non-zero.
  def while_stmt
    match('w')
    top, bottom = unique_label(:while), unique_label(:endwhile)

    emit_label(top)
    condition
    skip_any_whitespace
    x86_jz(bottom)              # Leave the loop once the condition is false.

    block(bottom)               # break inside the body jumps to bottom.
    match('e')
    x86_jmp(top)
    emit_label(bottom)
  end

  # Parse an until loop: the condition is tested at the top and the body
  # repeats for as long as it is zero.
  def until_stmt
    match('u')
    top, bottom = unique_label(:until), unique_label(:enduntil)

    emit_label(top)
    condition
    skip_any_whitespace
    x86_jnz(bottom)             # Leave the loop once the condition is true.

    block(bottom)               # break inside the body jumps to bottom.
    match('e')
    x86_jmp(top)
    emit_label(bottom)
  end

  # Parse an unconditional loop.  The body repeats forever; the only way
  # out is a break, which jumps to the label emitted after the loop.
  def repeat_stmt
    match('r')
    skip_any_whitespace         # No condition follows, so eat newlines too.
    top, bottom = unique_label(:repeat), unique_label(:endrepeat)

    emit_label(top)
    block(bottom)
    match('e')
    x86_jmp(top)
    emit_label(bottom)
  end

  # Parse a break statement: jump to the end label of the innermost
  # enclosing loop.  A break outside any loop is a parse error.
  def break_stmt
    match('b')
    expected(:'break to be somewhere useful',
             :got => :'a break without a loop') if @break_stack.empty?
    innermost = @break_stack.last
    x86_jmp(innermost)
  end

  # Evaluates any expression for now.  There are no boolean operators.
  def condition
    expression                  # Value to test ends up in eax.
    zero = 0                    # 0 is false, anything else is true.
    x86_cmp(:eax, zero)
    skip_whitespace
  end

  # Parse an addition operator and the 2nd term (b).  The result is
  # left in eax.  The 1st term (a) is expected on the stack.
  def add
    saved_term = '[esp]'        # a, pushed by the caller.
    match('+')
    term                        # b ends up in eax.
    x86_add(:eax, saved_term)   # eax = b + a.
  end

  # Parse a subtraction operator and the 2nd term (b).  The result is
  # left in eax.  The 1st term (a) is expected on the stack.
  def subtract
    saved_term = '[esp]'        # a, pushed by the caller.
    match('-')
    term                        # b ends up in eax.
    x86_neg(:eax)               # a - b is computed as a + (-b).
    x86_add(:eax, saved_term)   # eax = -b + a.
  end

  # Parse a multiplication operator and the 2nd factor (b).  The result
  # is left in eax.  The 1st factor (a) is expected on the stack.
  def multiply
    saved_factor = 'dword [esp]'  # a, pushed by the caller.
    match('*')
    factor                        # b ends up in eax.
    x86_imul(saved_factor)        # Multiply b by a.
  end

  # Parse a division operator and the divisor (b).  The result is
  # left in eax.  The dividend (a) is expected on the stack.
  def divide
    match('/')
    factor                      # Divisor (b) ends up in eax.

    # Swap so the dividend (a) is in eax and the divisor sits on the
    # stack, which is where idiv needs them.
    x86_xchg(:eax, '[esp]')

    # idiv divides edx:eax, so edx must hold the sign extension of eax
    # (cdq = Convert Double to Quad).
    emit('cdq')

    x86_idiv('dword [esp]')     # eax = a / b.
  end


############
# internal #
############


  # True once the input stream is exhausted AND the lookahead character
  # has been consumed as well (get_char sets it to nil at end of input).
  def eof?
    return false unless @input.eof?

    @look.nil?
  end

  # Is the lookahead character an additive operator (+ or -)?
  def addop?
    ['+', '-'].include?(@look)
  end

  # Is the lookahead character a multiplicative operator (* or /)?
  def mulop?
    ['*', '/'].include?(@look)
  end


  # Read the next character from the input stream.
  def get_char
    # One byte at a time; nil marks end of input.
    @look = @input.eof? ? nil : @input.readbyte.chr
  end

  # Report error and halt
  def abort(msg)
    # ParseError's constructor takes (caller, context), not a message.
    # Build the error with the backtrace and let raise attach +msg+,
    # which is how expected raises it too; otherwise +msg+ would end up
    # in the caller slot and the message would be lost.
    raise ParseError.new(caller), msg
  end

  # Report what was expected
  def expected(what, options={})
    got = options.fetch(:got, @look)

    # Symbols are descriptions and print bare; anything else is literal
    # input and is shown in single quotes.
    describe = lambda { |x| x.is_a?(Symbol) ? x : "'#{x}'" }
    got = describe.call(got)
    what = describe.call(what)

    if eof?
      raise ParseError.new(caller), "Premature end of file, expected: #{what}."
    end

    # The rest of the current line becomes the error context.  Note that
    # this consumes it from the input stream.
    line = begin
      @input.readline
    rescue StandardError
      '(EOF)'
    end
    context = line.gsub("\n", "\\n")
    raise ParseError.new(caller, context), "Expected #{what} but got #{got}."
  end


  # Recognize an alphabetical character.
  def alpha?(char)
    # The lookahead is nil at end of input; nil is simply not a letter.
    return false if char.nil?

    ('A'..'Z') === char.upcase
  end

  # Recognize a decimal digit.
  def digit?(char)
    decimal_digits = '0'..'9'
    decimal_digits === char
  end

  # Recognize an alphanumeric character.
  def alnum?(char)
    return true if alpha?(char)

    digit?(char)
  end

  # Recognize horizontal whitespace: a space or a tab.
  def whitespace?(char)
    [' ', "\t"].include?(char)
  end

  # Recognize any whitespace: space, tab, line feed or carriage return.
  def any_whitespace?(char)
    [' ', "\t", "\n", "\r"].include?(char)
  end

  # Parse one or more newlines.
  def newline
    # At least one line terminator is required ...
    expected(:newline) unless @look == "\n" || @look == "\r"

    # ... and any further ones are swallowed too.
    get_char while @look == "\n" || @look == "\r"
  end

  # Match a specific input character.
  def match(char)
    expected(char) if @look != char

    # Consume the character and any blanks that follow it.
    get_char
    skip_whitespace
  end

  # Parse zero or more consecutive characters for which the test is
  # true.
  def many(test)
    collected = ''
    # +test+ is anything callable with one character (a Method here).
    while test.call(@look)
      collected << @look
      get_char
    end
    skip_whitespace             # Trailing blanks are not part of the token.
    collected
  end


  # Get an identifier.
  def get_name
    expected(:identifier) unless alpha?(@look)

    name = many(method(:alnum?))

    # Reserved words cannot be used as identifiers.
    expected(:identifier, :got => :keyword) if @keywords.include?(name)
    name
  end

  # Get a number.
  def get_num
    expected(:integer) unless digit?(@look)

    digits = many(method(:digit?))
    digits.to_i
  end

  # Skip leading whitespace.
  def skip_whitespace
    # Spaces and tabs only; newlines are significant to the parser.
    while whitespace?(@look)
      get_char
    end
  end

  # Skip leading whitespace including newlines.
  def skip_any_whitespace
    while any_whitespace?(@look)
      get_char
    end
  end


  # Define a constant in the .data section.
  def equ(name, value)
    # Each definition gets its own line, like the lines written by var
    # and emit; without the newline successive constants would run
    # together on one line of the section text.
    @data << "#{name}\tequ  #{value}\n"
  end

  # Define a variable with the given name and size (in dwords).
  def var(name, dwords=1)
    # Defining an existing variable again is silently ignored.
    return if @vars.key?(name)

    @bss << "#{name}: resd #{dwords}\n"

    # Locations are BYTE offsets into the BSS section because
    # decode_addr adds them directly to the section's byte address.
    # Counting in dwords would make consecutive variables overlap.
    @vars[name] = @bss_size
    @bss_size += dwords * MachineBytes
  end

  # Emit a line of code wrapped between a tab and a newline.
  def emit(code, options={})
    # :tab replaces the default leading tab; passing nil removes the
    # indentation altogether (used for labels).
    indent = options.fetch(:tab, "\t")
    @code << indent.to_s << code.to_s << "\n"
  end

  # Write the label +name+ (a fresh unique label by default) to the
  # assembly text, unindented, and record the current length of the
  # machine code as its location.  Returns that location.
  def emit_label(name=unique_label)
    emit("#{name}:", :tab => nil)
    @labels.store(name, @binary.size)
  end

  # Look up the recorded location of +label+.  The lookup goes through
  # the label table's default handler, so what happens for an unknown
  # label is decided by that table.
  def resolve_label(label)
    location = @labels[label]
    location
  end

  # Generate a unique label.
  def unique_label(suffix=nil)
    @num_labels += 1
    number = sprintf("%06d", @num_labels)
    return "L#{number}#{suffix}" unless suffix

    # Labels sharing a suffix additionally get their own running count.
    count = (@num_labels_with_suffix[suffix] += 1)
    "L#{number}_#{suffix}_#{count}"
  end


  # x86 machine code generation

  # Append a single byte to the machine code buffer.
  def emit_byte(byte)
    @binary.push(byte)
  end

  # Append +num+ to the machine code buffer as the four bytes produced
  # by num_to_quad.
  def emit_dword(num)
    # Append in place: `+=` would copy the whole buffer for every dword
    # written.
    @binary.concat(num_to_quad(num))
  end

  # 0-2: r/m
  # 3-5: reg/opcode
  # 6-7: mod
  #
  # dest and src are tuples of the form [type, value] where type is
  # any of :reg, :rm32, :imm32.  Max _one_ :rm32 arg per call.
  def emit_modrm(dest, src, override={})
    # Work out which operand fills the reg field and which the r/m
    # field.  With override[:op] the reg field carries an opcode
    # extension instead of a register (the "/digit" instruction forms),
    # so neither operand has to be a register.
    if dest[0] == :reg
      reg_operand, rm_operand = dest, src
    elsif src[0] == :reg
      reg_operand, rm_operand = src, dest
    elsif override[:op]
      reg_operand, rm_operand = nil, dest
    else
      raise "unsupported mod r/m byte! dest=#{dest} src=#{src}"
    end

    reg = override[:op] || regnum(reg_operand[1])
    raise "unsupported register in mod r/m byte: #{reg_operand[1]}" if reg.nil?

    sib = disp = nil
    case rm_operand[0]
    when :reg
      # mod == 11 (register content)
      mod = 3
      rm = regnum(rm_operand[1])
    when :rm32
      # mod == 00 (pointer)
      mod = 0
      parts = decode_addr(rm_operand[1])
      rm = case parts[0]
           when :reg                # direct pointer, e.g. [eax]
             regnum(parts[1])
           when :sib                # r/m == 100: a SIB byte follows
             sib = parts[1..-1]
             4
           when :disp               # r/m == 101: a 32-bit displacement follows
             disp = parts[1]
             5
           end
    else
      raise "unsupported mod r/m byte! dest=#{dest} src=#{src}"
    end
    raise "unsupported r/m operand in mod r/m byte: #{rm_operand[1]}" if rm.nil?

    # The three fields occupy disjoint bits and are OR-ed together.
    emit_byte((mod << 6) | (reg << 3) | rm)
    emit_sib(sib) if sib
    emit_dword(disp) if disp
  end

  # Emit a SIB (scale-index-base) byte.
  #
  # +sib+ is [scale, index, base]: scale is 1, 2, 4 or 8 and index and
  # base are register names as produced by decode_addr.  Raises when the
  # scale or either register is not supported.
  def emit_sib(sib)
    scale, index, base = *sib

    # The scale field holds log2(scale), which is simply the position of
    # the scale in this list.  It must be an Integer to be shifted.
    scale_bits = [1, 2, 4, 8].index(scale)
    if scale_bits.nil?
      raise "unsupported SIB scale: #{scale}, should be [1, 2, 4, 8]"
    end

    index_num = regnum(index)
    base_num = regnum(base)
    if index_num.nil? || base_num.nil?
      raise "unsupported SIB registers: index=#{index} base=#{base}"
    end

    # Bits 6-7: scale, bits 3-5: index, bits 0-2: base.
    emit_byte((scale_bits << 6) | (index_num << 3) | base_num)
  end

  # Pack the accumulated machine code bytes into a byte string, store it
  # in @machine_code and return it.
  def compile
    bytes = @binary
    @machine_code = bytes.pack('c*')
  end


  # Some asm methods for convenience and arity checks.  Now emits
  # some real machine code too.

  # This is the full set of x86 registers.
  # Registers = [:eax, :ecx, :edx, :ebx, :esp, :ebp, :esi, :edi]

  # This will do for early work.
  # Position indicates value in op codes.
  Registers = [:eax, :ecx, :edx, :ebx].freeze

  # Regex to match any x86 register name, and then some.  Should be
  # sufficient.
  RegisterRegex = 'e[acdbsd][xip]'.freeze

  # Match a literal number in binary, octal, decimal, or hex
  NumberRegex = '(0[xXbB]?)?[0-9a-fA-F]+'.freeze

  # Match a variable name.
  NameRegex = '[a-zA-Z][a-zA-Z0-9]*'.freeze

  # 0.size gives the real answer, we only do x86 though
  MachineBytes = 4
  MachineBits = MachineBytes * 8
  MinSigned = -1 * 2**(MachineBits-1)
  MaxSigned = 2**(MachineBits-1) - 1
  MinUnsigned = 0
  MaxUnsigned = 2**MachineBits - 1
  SignedRange = MinSigned..MaxSigned

  # assemble x86 machine code
  def asm
    # Remember how much machine code existed before the block ran ...
    bytes_before = @binary.size

    yield

    # ... and report how many bytes the block added.
    @binary.size - bytes_before
  end

  # Return the position of +op+ in Registers, which doubles as its
  # number in op codes, or nil when +op+ is not a supported register.
  # Position 0 is still truthy in Ruby, so the result works as a
  # predicate too.
  def register?(op)
    Registers.find_index(op)
  end
  alias_method :regnum, :register?

  # Is +op+ an immediate value: a number, or a string that consists
  # entirely of a numeric literal matching NumberRegex?
  def immediate?(op)
    return true if op.is_a?(Numeric)

    op.is_a?(String) && op.match(/^#{NumberRegex}$/)
  end

  # Is +op+ a memory operand: either a plain offset (see offset?) or a
  # bracketed base register with an optional index register and scale?
  def rm32?(op)
    plain_offset = offset?(op)
    return plain_offset if plain_offset
    return false unless op.respond_to?(:match)

    op.match(/^
      \[
       #{RegisterRegex}                       # base register
       (\+#{RegisterRegex}                    # optional index register
        (\*[1248])?                           # optional scale
       )?
      \]
      $/x)
  end

  # 6 versions of the mov instruction are supported:
  #   1.  mov reg32, immediate32 (0xb8+destreg, imm32)
  #   2.  mov reg32, r/m32 (0x8b, mod r/m, maybe sib)
  #   2a. mov eax, memoffset32 (0xa1, disp32)
  #   3.  mov r/m32, reg32 (0x89, mod r/m, maybe sib)
  #   3a. mov memoffset32, eax (0xa3, disp32)
  #   4.  mov r/m32, immediate32 (0xc7, mod r/m, maybe sib, imm32)
  def x86_mov(dest, src)
    emit("mov #{dest}, #{src}")

    # The size prefix only matters in the assembly text; drop it before
    # classifying the operands.
    dest = dest[6..-1] if dest.is_a?(String) && dest[0..5] == 'dword '
    src = src[6..-1] if src.is_a?(String) && src[0..5] == 'dword '

    # Returns the number of machine code bytes written.
    asm do

      # version 1: mov r32, imm32
      if register?(dest) && immediate?(src)
        emit_byte(0xb8 + regnum(dest)) # dest encoded in instruction
        emit_dword(parse_num(src))

      # version 2: mov r32, r/m32
      elsif register?(dest) && rm32?(src)
        # version 2a: mov eax, moffs32
        if dest == :eax && offset?(src)
          emit_byte(0xa1)
          emit_dword(decode_addr(src)[1])
        else
          emit_byte(0x8b)
          emit_modrm([:reg, dest], [:rm32, src], {})
        end

      # version 3: mov r/m32, r32
      elsif rm32?(dest) && register?(src)
        # version 3a: mov moffs32, eax
        if offset?(dest) && src == :eax
          emit_byte(0xa3)
          emit_dword(decode_addr(dest)[1])
        else
          emit_byte(0x89)
          emit_modrm([:rm32, dest], [:reg, src], {})
        end

      # version 4: mov r/m32, imm32
      elsif rm32?(dest) && immediate?(src)
        emit_byte(0xc7)
        emit_modrm([:rm32, dest], [:imm32, src], :op => 0)
        # The immediate operand follows the mod r/m (and SIB/disp) bytes.
        emit_dword(parse_num(src))

      # Anything else is reported on stdout and produces no machine
      # code; the assembly text was already emitted above.
      else
        puts "rm32?(dest): #{rm32?(dest)}\t\trm32?(src): #{rm32?(src)}"
        puts "register?(dest): #{register?(dest)}\t\tregister?(src): #{register?(src)}"
        puts "immediate?(dest): #{immediate?(dest)}\t\timmediate?(src): #{immediate?(src)}"
        puts "offset?(dest): #{offset?(dest)}\t\toffset?(src): #{offset?(src)}"
        #raise "unsupported mov format: mov #{dest}, #{src}"
        puts "!!! unsupported mov format: mov #{dest}, #{src}"
      end

    end # asm do

  end


  # Emit the text of an add instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_add(dest, src)
    instruction = "add #{dest}, #{src}"
    emit(instruction)
  end

  # Emit the text of a sub instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_sub(dest, src)
    instruction = "sub #{dest}, #{src}"
    emit(instruction)
  end

  # Emit the text of a one-operand imul instruction.  Only assembly
  # text is produced here, no machine code.
  def x86_imul(op)
    instruction = "imul #{op}"
    emit(instruction)
  end

  # Emit the text of an idiv instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_idiv(op)
    instruction = "idiv #{op}"
    emit(instruction)
  end

  # Emit the text of a push instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_push(reg)
    instruction = "push #{reg}"
    emit(instruction)
  end

  # Emit the text of a call instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_call(label)
    instruction = "call #{label}"
    emit(instruction)
  end

  # Emit the text of a neg instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_neg(reg)
    instruction = "neg #{reg}"
    emit(instruction)
  end

  # Emit the text of an xchg instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_xchg(op1, op2)
    instruction = "xchg #{op1}, #{op2}"
    emit(instruction)
  end

  # Emit the text of an xor instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_xor(op1, op2)
    instruction = "xor #{op1}, #{op2}"
    emit(instruction)
  end

  # Emit the text of a jz (jump if zero) instruction.  Only assembly
  # text is produced here, no machine code.
  def x86_jz(label)
    instruction = "jz #{label}"
    emit(instruction)
  end

  # Emit the text of a jnz (jump if not zero) instruction.  Only
  # assembly text is produced here, no machine code.
  def x86_jnz(label)
    instruction = "jnz #{label}"
    emit(instruction)
  end

  # Emit the text of an unconditional jmp instruction.  Only assembly
  # text is produced here, no machine code.
  def x86_jmp(label)
    instruction = "jmp #{label}"
    emit(instruction)
  end

  # Emit the text of a cmp instruction.  Only assembly text is
  # produced here, no machine code.
  def x86_cmp(a, b)
    instruction = "cmp #{a}, #{b}"
    emit(instruction)
  end


  # Is +addr+ a plain memory offset, i.e. a bracketed variable name or
  # number such as "[foo]" or "[0x10]"?  Returns a truthy value if so.
  def offset?(addr)
    return false unless addr.respond_to?(:match)

    # Register names also look like variable names, but "[eax]" is a
    # register-indirect operand, not a memory offset.
    return false if addr.match(/^\[#{RegisterRegex}\]$/)

    addr.match(/^\[(#{NameRegex}|#{NumberRegex})\]$/)
  end

  # Break a bracketed address operand into its parts.  Returns one of
  #   [:reg, register]                  for "[eax]"
  #   [:disp, address]                  for "[name]" or "[number]"
  #   [:sib, scale, index, base]        for "[base+index]" / "[base+index*scale]"
  # Raises when a variable name has not been defined.
  def decode_addr(addr)
    addr = addr[1..-2]   # strip brackets

    # Registers must be recognized before names: every register name
    # also matches NameRegex and would be taken for an undefined
    # variable otherwise.
    if addr.match(/^#{RegisterRegex}$/)
      [:reg, addr.to_sym]
    elsif (matches = addr.match(/^#{NameRegex}$/))
      unless (loc = @vars[matches[0]])
        raise "undefined variable #{matches[0]}"
      end
      [:disp, @bss_offset + loc]
    elsif (matches = addr.match(/^#{NumberRegex}$/))
      [:disp, parse_num(matches[0])]
    elsif addr.index('*')
      bi, scale = *addr.split('*')
      base, index = *bi.split('+')
      [:sib, scale.to_i, index.to_sym, base.to_sym]
    elsif addr.index('+')
      base, index = *addr.split('+')
      [:sib, 1, index.to_sym, base.to_sym]
    else
      [:reg, addr.to_sym]
    end
  end

  # Parse a number from a string.  Used by emit_dword.
  def parse_num(str)
    # If it's not a string it's a number, just return it.
    return str unless str.is_a?(String)

    # Work on a lower-cased copy so the caller's string is never
    # modified (and frozen strings are accepted).
    digits = str.downcase

    base = 10                   # default to base 10
    if digits[0, 1] == '0'
      base = case digits[1, 1]
             when 'x' then 16
             when 'b' then 2
             else 8
             end
    end

    # String#to_i skips the 0x / 0b / 0 prefix that matches the base,
    # so the prefix does not have to be removed first.
    digits.to_i(base)
  end

  # Convert a number to a quad of bytes, discarding excess bits.
  # Little endian!
  def num_to_quad(num)
    # Byte i is bits 8*i .. 8*i+7, least significant byte first.
    Array.new(4) { |i| (num >> (8 * i)) & 0xff }
  end

  # Compute the base 2 logarithm of +x+ as a Float.
  #
  # +tol+ bounds the refinement of the fractional part: bits are added
  # until the next one would be worth less than +tol+.  Raises
  # Math::DomainError unless +x+ is greater than zero.
  def log2(x, tol=1e-13)
    raise Math::DomainError, "log2 is undefined for #{x}" unless x > 0
    return x if x == Float::INFINITY

    # Work in floating point; with an Integer the halving below would
    # be integer division and drop the fractional part.
    x = x.to_f
    result = 0.0

    # Integer part: scale x into [1, 2), counting the doublings and
    # halvings.
    while x < 1
      result -= 1
      x *= 2
    end
    while x >= 2
      result += 1
      x /= 2
    end

    # Fractional part: squaring x doubles its logarithm, so each
    # squaring exposes the next binary digit of the fraction.
    fp = 1.0
    while fp >= tol
      fp /= 2
      x *= x
      if x >= 2
        x /= 2
        result += fp
      end
    end
    result
  end

end