From f3158608f4c8c197c49108db5f4911edc2a85b43 Mon Sep 17 00:00:00 2001
From: Cesar Blum <cblum@mailbox.org>
Date: Sat, 26 Sep 2020 22:20:15 -0700
Subject: [PATCH] Initial commit.

---
 LICENSE                  |  19 ++
 Makefile                 |  24 +++
 README.md                | 109 ++++++++++
 examples/01-helloworld.f | 209 ++++++++++++++++++
 examples/02-quine.f      |  15 ++
 examples/03-variables.f  |  31 +++
 examples/04-debugstack.f |  80 +++++++
 examples/05-fizzbuzz.f   |  22 ++
 examples/README.md       | 119 +++++++++++
 sectorforth.asm          | 442 +++++++++++++++++++++++++++++++++++++++
 10 files changed, 1070 insertions(+)
 create mode 100644 LICENSE
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 examples/01-helloworld.f
 create mode 100644 examples/02-quine.f
 create mode 100644 examples/03-variables.f
 create mode 100644 examples/04-debugstack.f
 create mode 100644 examples/05-fizzbuzz.f
 create mode 100644 examples/README.md
 create mode 100644 sectorforth.asm

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e53db62
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2020 Cesar Blum
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..46bb462
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,24 @@
+name = sectorforth
+
+all: $(name).bin $(name).img
+
+%.bin: %.asm
+	nasm -f bin -o $@ -l $(^:.asm=.lst) $^
+
+%.img: %.bin
+	dd if=$^ of=boot.img bs=512
+	dd if=/dev/zero of=zero.img bs=512 count=2879
+	cat boot.img zero.img > $@
+	rm -f boot.img zero.img
+
+.PHONY: debug
+gdb: $(name).bin
+	qemu-system-i386 -hda $^ -monitor stdio -s -S
+
+.PHONY: run
+run: $(name).bin
+	qemu-system-i386 -hda $^
+
+.PHONY: clean
+clean:
+	rm -rf *.{bin,lst,img}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..61b44bd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,109 @@
+# sectorforth
+
+sectorforth is a 16-bit x86 Forth that fits in a 512-byte boot sector.
+
+Inspiration to write sectorforth came from a 1996
+[Usenet thread](https://groups.google.com/g/comp.lang.forth/c/NS2icrCj1jQ)
+(in particular, Bernd Paysan's first post on the thread).
+
+## Batteries not included
+
+sectorforth contains only the eight primitives outlined in the Usenet
+post above, five variables for manipulating internal state, and two I/O
+primitives.
+
+With that minimal set of building blocks, words for branching, compiling,
+manipulating the return stack, etc. can all be written in Forth itself
+(check out the examples!).
+
+The colon compiler (`:`) is available, so new words can be defined easily
+(that means `;` is also there, of course).
+
+Contrary to many Forth implementations, sectorforth does not attempt to
+convert unknown words to numbers, since numbers can be produced using the
+available primitives. The two included I/O primitives are sufficient to
+write a more powerful interpreter that can parse numbers.
+
+### Primitives
+
+| Primitive | Stack effects | Description                                   |
+| --------- | ------------- | --------------------------------------------- |
+| `@`       | ( addr -- x ) | Fetch memory contents at addr                 |
+| `!`       | ( x addr -- ) | Store x at addr                               |
+| `sp@`     | ( -- sp )     | Get pointer to top of data stack              |
+| `rp@`     | ( -- rp )     | Get pointer to top of return stack            |
+| `0=`      | ( x -- flag ) | -1 if top of stack is 0, 0 otherwise          |
+| `+`       | ( x y -- z )  | Sum the two numbers at the top of the stack   |
+| `nand`    | ( x y -- z )  | NAND the two numbers at the top of the stack  |
+| `exit`    | ( r:addr -- ) | Pop return stack and resume execution at addr |
+| `key`     | ( -- x )      | Read key stroke as ASCII character)           |
+| `emit`    | ( x -- )      | Print low byte of x as an ASCII character     |
+
+### Variables
+
+| Variable | Description                                                   |
+| -------- | ------------------------------------------------------------- |
+| `state`  | 0: execute words; 1: compile word addresses to the dictionary |
+| `tib`    | Terminal input buffer, where input is parsed from             |
+| `>in`    | Current parsing offset into terminal input buffer             |
+| `here`   | Pointer to next free position in the dictionary               |
+| `latest` | Pointer to most recent dictionary entry                       |
+
+## Compiling
+
+sectorforth was developed using NASM 2.15.01. Earlier versions of NASM
+are probably capable of compiling it, but that hasn't been tested.
+
+To compile sectorforth, just run `make`:
+
+```
+$ make
+```
+
+That will produce a compiled binary (`sectorforth.bin`) and a floppy disk
+image (`sectorforth.img`) containing the binary in its boot sector.
+
+## Running
+
+The makefile contains two targets for running sectorforth in QEMU:
+
+- `debug` starts QEMU in debug mode, with execution paused. That allows
+you to set up a remote target in GDB (`target remote localhost:1234`) and
+set any breakpoints you want before sectorforth starts running.
+- `run` simply runs sectorforth in QEMU.
+
+## Usage
+
+Up to 4KB of input can be entered per line. After pressing return, the
+interpreter parses one word at a time an interprets it (i.e. executes it
+or compiles it, according to the current value of the `state` variable).
+
+sectorforth does not print the ` ok` prompt familiar to Forth users.
+However, if a word is not found in the dictionary, the error message `!!`
+is printed in red, letting you know an error happened.
+
+When a word is not found in the dictionary, the interpreter's state is
+reset: the data and return stacks, as well as the terminal input buffer
+are cleared, and the interpreter is placed in interpretation mode. Other
+errors (e.g. compiling an invalid address in a word definition and
+attempting to execute it) are not handled gracefully, and will crash the
+interpreter.
+
+## Code structure
+
+Comments throughout the code assume familiarity with Forth and how it is
+commonly implemented.
+
+If you're not familiar with Forth, read Leo Brodie's
+[Starting Forth](https://www.forth.com/starting-forth).
+
+If you're not familiar with how Forth is implemented on x86, read the
+assembly code for Richard W.M. Jones'
+[jonesforth](http://git.annexia.org/?p=jonesforth.git;a=blob;f=jonesforth.S)
+
+sectorforth draws a lot of inspiration from jonesforth, but the latter
+does a much better job at explaining the basics in its comments.
+
+For an excellent introduction to threaded code techniques, and to how to
+implement Forth in different architectures, read Brad Rodriguez's
+[Moving Forth](http://www.bradrodriguez.com/papers/moving1.htm)
diff --git a/examples/01-helloworld.f b/examples/01-helloworld.f
new file mode 100644
index 0000000..e14ad67
--- /dev/null
+++ b/examples/01-helloworld.f
@@ -0,0 +1,209 @@
+\ "hello, world" example for sectorforth, a 512-byte, bootable x86 Forth.
+\ Copyright (c) 2020 Cesar Blum
+\ Distributed under the MIT license. See LICENSE for details.
+
+: dup ( x -- x x ) sp@ @ ;
+
+\ make some numbers
+: -1 ( x -- x -1 ) dup dup nand dup dup nand nand ;
+: 0 -1 dup nand ;
+: 1 -1 dup + dup nand ;
+: 2 1 1 + ;
+: 4 2 2 + ;
+: 6 2 4 + ;
+
+\ logic and arithmetic operators
+: invert ( x -- !x ) dup nand ;
+: and ( x y -- x&y ) nand invert ;
+: negate ( x -- -x ) invert 1 + ;
+: - ( x y -- x-y ) negate + ;
+
+\ equality checks
+: = ( x y -- flag ) - 0= ;
+: <> ( x y -- flag ) = invert ;
+
+\ stack manipulation words
+: drop ( x y -- x ) dup - + ;
+: over ( x y -- x y x ) sp@ 2 + @ ;
+: swap ( x y -- y x ) over over sp@ 6 + ! sp@ 2 + ! ;
+: nip ( x y -- y ) swap drop ;
+: 2dup ( x y -- x y x y ) over over ;
+: 2drop ( x y -- ) drop drop ;
+
+\ more logic
+: or ( x y -- x|y ) invert swap invert and invert ;
+
+\ compile things
+: , ( x -- ) here @ ! here @ 2 + here ! ;
+
+\ left shift 1 bit
+: 2* ( x -- 2x ) dup + ;
+
+\ constant to check/set immediate flag
+: 80h ( -- 80h ) 1 2* 2* 2* 2* 2* 2* 2* ;
+
+\ make words immediate
+: immediate latest @ 2 + dup @ 80h or swap ! ;
+
+\ control interpreter state
+: [ 0 state ! ; immediate
+: ] 1 state ! ;
+
+\ unconditional branch
+: branch ( r:addr -- r:addr+offset ) rp@ @ dup @ + rp@ ! ;
+
+\ conditional branch when top of stack is 0
+: ?branch ( r:addr -- r:addr | r:addr+offset)
+    0= rp@ @ @ 2 - and rp@ @ + 2 + rp@ ! ;
+
+\ lit pushes the value on the next cell to the stack at runtime
+\ e.g. lit [ 42 , ] pushes 42 to the stack
+: lit ( -- x ) rp@ @ dup 2 + rp@ ! @ ;
+
+\ ['] is identical to lit, the choice of either depends on context
+\ don't write as : ['] lit ; as that will break lit's assumptions about
+\ the return stack
+: ['] ( -- addr ) rp@ @ dup 2 + rp@ ! @ ;
+
+\ push/pop return stack
+: >rexit ( addr r:addr0 -- r:addr )
+    rp@ ! ;                 \ override return address with original return
+                            \ address from >r
+: >r ( x -- r:x)
+    rp@ @                   \ get current return address
+    swap rp@ !              \ replace top of return stack with value
+    >rexit ;                \ push new address to return stack
+: r> ( r:x -- x )
+    rp@ 2 + @               \ get value stored in return stack with >r
+    rp@ @ rp@ 2 + !         \ replace value with address to return from r>
+    lit [ here @ 6 + , ]    \ get address to this word's exit call
+    rp@ ! ;                 \ make code return to this word's exit call,
+                            \ effectively calling exit twice to pop return
+                            \ stack entry created by >r
+
+\ rotate stack
+: rot ( x y z -- y z x ) >r swap r> swap ;
+
+\ if/then/else
+: if
+    ['] ?branch ,       \ compile ?branch to skip if's body when false
+    here @              \ get address where offset will be written
+    0 ,                 \ compile dummy offset
+    ; immediate
+: then
+    dup                 \ duplicate offset address
+    here @ swap -       \ calculate offset from if/else
+    swap !              \ store calculated offset for ?branch/branch
+    ; immediate
+: else
+    ['] branch ,        \ compile branch to skip else's body when true
+    here @              \ get address where offset will be written
+    0 ,                 \ compile dummy offset
+    swap                \ bring if's ?branch offset address to top of stack
+    dup here @ swap -   \ calculate offset from if
+    swap !              \ store calculated offset for ?branch
+    ; immediate
+
+\ begin...while...repeat and begin...until loops
+: begin
+    here @              \ get location to branch back to
+    ; immediate
+: while
+    ['] ?branch ,       \ compile ?branch to terminate loop when false
+    here @              \ get address where offset will be written
+    0 ,                 \ compile dummy offset
+    ; immediate
+: repeat
+    swap                        \ offset will be negative
+    ['] branch , here @ - ,     \ compile branch back to begin
+    dup here @ swap - swap !    \ compile offset from while
+    ; immediate
+: until
+    ['] ?branch , here @ - ,    \ compile ?branch back to begin
+    ; immediate
+
+\ do...loop loops
+: do ( end index -- )
+    here @                      \ get location to branch back to
+    ['] >r , ['] >r ,           \ at runtime, push inputs to return stack
+    ; immediate
+: loop
+    ['] r> , ['] r> ,           \ move current index and end to data stack
+    ['] lit , 1 , ['] + ,       \ increment index
+    ['] 2dup , ['] = ,          \ index equals end?
+    ['] ?branch , here @ - ,    \ when false, branch back to do
+    ['] 2drop ,                 \ discard index and end when loop terminates
+    ; immediate
+
+\ fetch/store bytes
+: 0fh lit [ 4 4 4 4 + + + 1 - , ] ;
+: ffh lit [ 0fh 2* 2* 2* 2* 0fh or , ] ;
+: c@ ( -- c ) @ ffh and ;
+: c! ( c addr -- )
+    dup @           \ fetch memory contents at address
+    ffh invert and  \ zero out low byte
+    rot ffh and     \ zero out high byte of value being stored
+    or swap !       \ overwrite low byte of existing contents
+    ;
+
+\ compile bytes
+: c, ( x -- ) here @ c! here @ 1 + here ! ;
+
+\ read literal string from word body
+: litstring ( -- addr len )
+    rp@ @ dup 2 + rp@ ! @   \ push length to stack
+    rp@ @                   \ push string address to stack
+    swap
+    2dup + rp@ ! ;          \ move return address past string
+
+\ print string
+: type ( addr len -- ) 0 do dup c@ emit 1 + loop drop ;
+
+\ read char from terminal input buffer, advance >in
+: in> ( "c<input>" -- c ) tib >in @ + c@ >in dup @ 1 + swap ! ;
+
+\ constant for space char
+: bl ( -- spc ) lit [ 1 2* 2* 2* 2* 2* , ] ;
+
+\ parse input with specified delimiter
+: parse ( delim "input<delim>" -- addr len )
+    in> drop                    \ skip space after parse
+    tib >in @ +                 \ put address of parsed input on stack
+    swap 0 begin                \ ( addr delim len )
+        over in>                \ ( addr delim len delim char )
+    <> while
+        1 +                     \ ( addr delim len+1 )
+    repeat swap                 \ ( addr len delim )
+    bl = if
+        >in dup @ 1 - swap !    \ move >in back 1 char if delimiter is bl,
+                                \ otherwise the interpreter is left in a
+                                \ bad state
+    then ;
+
+\ parse input with specified delimiter, skipping leading delimiters
+: word ( delim "<delims>input<delim>" -- addr len )
+    in> drop                    \ skip space after word
+    begin dup in> <> until      \ skip leading delimiters
+    >in @ 2 - >in !             \ "put back" last char read from tib,
+                                \ and backtrack >in leading char that will
+                                \ be skipped by parse
+    parse ;
+
+\ parse word, compile first char as literal
+: [char] ( "<spcs>input<spc>" -- c )
+    ['] lit , bl word drop c@ , ; immediate
+
+: ." ( "input<quote>" -- )
+    [char] " parse                      \ parse input up to "
+    state @ if
+        ['] litstring ,                 \ compile litstring
+        dup ,                           \ compile length
+        0 do dup c@ c, 1 + loop drop    \ compile string
+        ['] type ,                      \ display string at runtime
+    else
+        type                            \ display string
+    then ; immediate
+
+." hello, world"
+: hello ." hello, world" ;
+hello
diff --git a/examples/02-quine.f b/examples/02-quine.f
new file mode 100644
index 0000000..41f4ace
--- /dev/null
+++ b/examples/02-quine.f
@@ -0,0 +1,15 @@
+\ Quine example for sectorforth, a 512-byte, bootable x86 Forth.
+\ Copyright (c) 2020 Cesar Blum
+\ Distributed under the MIT license. See LICENSE for details.
+\ Depends on definitions built in the "hello, world" example.
+
+: 0<> 0= invert ;
+
+\ get address to input buffer and number of characters in it
+: source ( -- addr n )
+        tib dup
+        begin dup c@ 0<> while 1 + repeat
+        tib - ;
+
+\ prints itself
+source type
diff --git a/examples/03-variables.f b/examples/03-variables.f
new file mode 100644
index 0000000..9807e1e
--- /dev/null
+++ b/examples/03-variables.f
@@ -0,0 +1,31 @@
+\ Variables example for sectorforth, a 512-byte, bootable x86 Forth.
+\ Copyright (c) 2020 Cesar Blum
+\ Distributed under the MIT license. See LICENSE for details.
+\ Depends on definitions built in the "hello, world" example.
+
+\ constant to check/set hidden flag
+: 40h lit [ 1 2* 2* 2* 2* 2* 2* , ] ;
+
+\ make words visible
+: reveal latest @ 2 + dup @ 40h invert and swap ! ;
+
+\ creates a word that pushes the address to its body at runtime
+: create
+    :               \ parse word and create dictionary entry
+    ['] lit ,       \ compile lit
+    here @ 4 + ,    \ compile address past new word's exit call
+    ['] exit ,      \ compile exit
+    reveal          \ make created word visible
+    0 state !       \ switch back to interpretation state
+
+\ cells are 2 bytes wide
+: cells ( -- x ) lit [ 2 , ] ;
+
+\ reserve bytes in dictionary
+: allot ( x -- ) here @ + here ! ;
+
+: variable create 1 cells allot ;
+
+variable var
+2 var !
+var @ emit \ should print smiley face
diff --git a/examples/04-debugstack.f b/examples/04-debugstack.f
new file mode 100644
index 0000000..9d02713
--- /dev/null
+++ b/examples/04-debugstack.f
@@ -0,0 +1,80 @@
+\ Stack debugging example for sectorforth, a 512-byte, bootable x86 Forth.
+\ Copyright (c) 2020 Cesar Blum
+\ Distributed under the MIT license. See LICENSE for details.
+\ Depends on definitions built up to the variables examples.
+
+\ make a few more basic operators
+: ?dup dup ?branch [ 4 , ] dup ;
+: -rot ( x y z -- z x y ) rot rot ;
+: xor ( x y -- x^y) 2dup and invert -rot or and ;
+: 8000h lit [ 0 c, 80h c, ] ; \ little endian
+: >= ( x y -- flag ) - 8000h and 0= ;
+: < ( x y -- flag ) >= invert ;
+: <= ( x y -- flag ) 2dup < -rot = or ;
+: 0< ( x -- flag ) 0 < ;
+
+\ divison and modulo
+: /mod ( x y -- x%y x/y )
+    over 0< -rot                \ remainder negative if dividend is negative
+    2dup xor 0< -rot            \ quotient negative if operand signs differ
+    dup 0< if negate then       \ make divisor positive if negative
+    swap dup 0< if negate then  \ make dividend positive if negative
+    0 >r begin                  \ hold quotient in return stack
+        over 2dup >=            \ while divisor greater than dividend
+    while
+        -                       \ subtract divisor from dividend
+        r> 1 + >r               \ increment quotient
+    repeat
+    drop nip                    \ leave sign flags and remainder on stack
+    rot if negate then          \ set remainder sign
+    r> rot                      \ get quotient from return stack
+    if negate then ;            \ set quotient sign
+: / /mod nip ;
+: mod /mod drop ;
+
+\ constants for decimal and hexadecimal 10 (i.e. 10 and 16)
+: 10 lit [ 4 4 2 + + , ] ;
+: 10h lit [ 4 4 4 4 + + + , ] ;
+
+variable base
+10 base !
+
+\ switch to common bases
+: hex 10h base ! ;
+: decimal 10 base ! ;
+
+\ convert number to ASCII digit
+: digit ( x -- c )
+    dup 10 < if [char] 0 + else 10 - [char] A + then ;
+
+\ print space
+: space bl emit ;
+
+\ print number at the top of the stack in current base
+: . ( x -- )
+    -1 swap                                     \ put sentinel on stack
+    dup 0< if negate -1 else 0 then             \ make positive if negative
+    >r                                          \ save sign on return stack
+    begin base @ /mod ?dup 0= until             \ convert to base 10 digits
+    r> if [char] - emit then                    \ print sign
+    begin digit emit dup -1 = until drop        \ print digits
+    space ;                                     \ print space
+
+\ base of data stack
+: sp0 lit [ sp@ , ] ;
+
+\ print backspace
+: backspace lit [ 4 4 + , ] emit ;
+
+\ print stack
+: .s
+    sp@ 0 swap begin
+        dup sp0 <
+    while
+        2 +
+        swap 1 + swap
+    repeat swap
+    [char] < emit dup . backspace [char] > emit space
+    ?dup if
+        0 do 2 - dup @ . loop
+    then drop ;
diff --git a/examples/05-fizzbuzz.f b/examples/05-fizzbuzz.f
new file mode 100644
index 0000000..3c073e3
--- /dev/null
+++ b/examples/05-fizzbuzz.f
@@ -0,0 +1,22 @@
+\ FizzBuzz example for sectorforth, a 512-byte, bootable x86 Forth.
+\ Copyright (c) 2020 Cesar Blum
+\ Distributed under the MIT license. See LICENSE for details.
+\ Depends on definitions built up to the stack debugging example.
+
+\ get do...loop index
+: i ( -- index ) rp@ 4 + @ ;
+
+\ make more numbers
+: 3 1 2 + ;
+: 5 2 3 + ;
+
+\ newline
+: cr lit [ 4 6 3 + + , ] lit [ 4 6 + , ] emit emit ;
+
+: fizzbuzz ( x -- )
+    cr 1 + 1 do
+        i 3 mod 0= dup if ." Fizz" then
+        i 5 mod 0= dup if ." Buzz" then
+        or invert if i . then
+        cr
+    loop ;
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..d546569
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,119 @@
+# A note on return stack manipulation
+
+In these examples, some defintions like `branch` and `lit` do a fair
+bit of return stack manipulation that may not be immediately intuitive
+to grasp.
+
+The key to understanding how those definitions work is in how Forth's
+[threaded code](https://en.wikipedia.org/wiki/Threaded_code) is executed.
+
+A word's body is comprised of a sequence of addresses to other words it
+calls. One of the processor's registers (`SI`, in the case of sectorforth)
+works as Forth's "instruction pointer", which is distinct from the
+processor's instruction pointer.
+
+Consider the following word definition:
+
+```forth
+: w4 w1 w2 w3 ;
+```
+
+Its body is laid out in memory like this:
+
+```
+address               addr1           addr2           addr3
+                *---------------*---------------*---------------*
+contents        | address of w1 | address of w2 | address of w3 |
+                *---------------*---------------*---------------*
+size                 2 bytes         2 bytes         2 bytes
+```
+
+When `w4` is about to be executed, `SI` points to its first cell:
+
+```
+address               addr1           addr2           addr3
+                *---------------*---------------*---------------*
+contents        | address of w1 | address of w2 | address of w3 |
+                *---------------*---------------*---------------*
+size                 2 bytes         2 bytes         2 bytes
+                        ^
+                        |
+                        *--- SI
+```
+
+When `w4` starts executing and calls `w1`, two things happen:
+
+- `SI` is advanced to the next cell (i.e. `SI = SI + 2`)
+- `SI` is pushed to the return stack
+
+Which means that if `w1` were to fetch the contents of the return stack
+(`rp@ @`), it would get `addr2` as a result.
+
+Now, when a word finishes executing, it calls `exit`, which pops the
+return stack, and sets `SI` to the popped address so that execution
+resumes there. In the example above, the execution of `w4` would
+resume right past the point where it called `w1`, calling `w2` next.
+
+What if `w1` were to do the following though:
+
+```forth
+... rp@ @ 2 + rp@ ! ...
+```
+
+`rp@ @ 2 +` would fetch the top of the return stack, yielding `addr2`,
+then it would add 2 to it, resulting in `addr3`. `rp@ !` would then
+replace the value at the top of the return stack with `addr3`.
+
+In that situation, when `w1` calls `exit`, the top of the return stack
+is popped, yielding `addr3` this time, and execution resumes there,
+skipping the call to `w2` in the body of `w4` and going straight to `w3`.
+
+That's how `branch`, `lit`, and other definitions that manipulate the
+return stack work. `branch` reads an offset from the top of the return
+stack (`rp@ @ @` reads the contents of the address at the top of the
+return stack) and adds that offset to the address at the top of the return
+stack itself (`rp@ @ + rp@ !`), so execution skips a number of words
+corresponding to the offset (it actually skips bytes, so offsets always
+have to be multiples of 2 to skip words). Like `branch`, `lit` reads a
+value from the address at the top of the return stack, but always adds 2
+to that same address so execution skips the literal (since attemping to
+execute the literal value itself would not make sense).
+
+The most involved definitions in terms of manipulating the return stack
+are `>r` and `r>`, which push and pop arbitrary values to and from the
+return stack itself:
+
+```forth
+: >rexit ( addr r:addr0 -- r:addr )
+    rp@ ! ;                 \ override return address with original return
+                            \ address from >r
+: >r ( x -- r:x)
+    rp@ @                   \ get current return address
+    swap rp@ !              \ replace top of return stack with value
+    >rexit ;                \ push new address to return stack
+: r> ( r:x -- x )
+    rp@ 2 + @               \ get value stored in return stack with >r
+    rp@ @ rp@ 2 + !         \ replace value with address to return from r>
+    lit [ here @ 6 + , ]    \ get address to this word's exit call
+    rp@ ! ;                 \ make code return to this word's exit call,
+                            \ effectively calling exit twice to pop return
+                            \ stack entry created by >r
+```
+
+`>r` uses an auxiliary word, `>rexit`, to push a new
+address to the return stack (remember, an address is pushed every time a
+word is called, so calling `>rexit` will do just that), then replaces it
+with the return address that was pushed when `>r` was called. *That*
+original address can thus be replaced with whatever value was on the data
+stack when `>r` was called. When `>r` exits, the value left at the top of
+the return stack is the argument to `>r`.
+
+`r>` is a bit more complicated. In addition to reading a value placed on
+the return stack by `>r` earlier, `r>` needs to pop that off. Evidently,
+it cannot do so via an auxiliary word like `>r` does, since that would
+only _push_ yet another address on the return stack. Instead, it obtains
+the address to its `exit` call (located where `;` is), and replaces the
+value pushed by `>r` with it. When `r>` calls `exit` the first time,
+execution goes back _to that same exit call_ one more time, popping off
+the return stack space created by `>r`; the second call to `exit` then
+pops the address to return to wherever `r>` was called.
diff --git a/sectorforth.asm b/sectorforth.asm
new file mode 100644
index 0000000..db037f8
--- /dev/null
+++ b/sectorforth.asm
@@ -0,0 +1,442 @@
+        ; sectorforth - a 512-byte, bootable x86 Forth.
+        ; Copyright (c) 2020 Cesar Blum
+        ; Distributed under the MIT license. See LICENSE for details.
+        ;
+        ; sectorforth is a 16-bit x86 Forth that fits entirely within a
+        ; boot sector (512 bytes).
+        ;
+        ; It's a direct threaded Forth, with SI acting as the Forth
+        ; instruction pointer. Words are executed using LODSW to advance
+        ; SI and load the next word's address into AX, which is then
+        ; jumped to.
+        ;
+        ; The SP register is used as the data stack pointer, and the BP
+        ; register acts as the return stack pointer.
+        ;
+        ; The minimum CPU required to run sectorforth is the 386, to use
+        ; the SETNZ instruction.
+        bits 16
+        cpu 386
+
+        ; Set CS to a known value by performing a far jump. Memory up to
+        ; 0x0500 is used by the BIOS. Setting the segment to 0x0500 gives
+        ; sectorforth an entire free segment to work with.
+        jmp 0x0050:start
+
+        ; On x86, the boot sector is loaded at 0x7c00 on boot. In segment
+        ; 0x0500, that's 0x7700 (0x0500 << 4 + 0x7700 == 0x7c00).
+        org 0x7700
+
+        ; Define constants for the memory map. Everything is organized
+        ; within a single 64 KB segment. TIB is placed at 0x0000 to
+        ; simplify input parsing (the Forth variable >IN ends up being
+        ; also a pointer into TIB, so there's no need to add >IN to TIB
+        ; to get a pointer to the parse area). TIB is 4 KB long.
+TIB             equ 0x0000      ; terminal input buffer (TIB)
+STATE           equ 0x1000      ; current state (0=interpret, 1=compile)
+TOIN            equ 0x1002      ; current read offset into TIB (>IN)
+RP0             equ 0x76fe      ; bottom of return stack
+SP0             equ 0xfffe      ; bottom of data stack
+
+        ; Each dictionary entry is laid out in memory as such:
+        ;
+        ; *--------------*--------------*--------------*--------------*
+        ; | Link pointer | Flags+Length |    Name...   |    Code...   |
+        ; *--------------*--------------*--------------*--------------*
+        ;     2 bytes         1 byte      Length bytes     Variable
+        ;
+        ; Flags IMMEDIATE and HIDDEN are used in assembly code. Room is
+        ; left for an additional, user-defined flag, so word names are
+        ; limited to 32 characters.
+F_IMMEDIATE     equ 0x80
+F_HIDDEN        equ 0x40
+LENMASK         equ 0x1f
+
+        ; Each dictionary entry needs a link to the previous entry.  The
+        ; initlink macro uses the nasm context stack to push a context and
+        ; define a context-local macro called %$link that represents the
+        ; end of the dictionary.
+%macro initlink 0
+%push
+%define %$link 0
+%endmacro
+
+        initlink                ; expand initlink so %$link is initialized
+
+        ; The link macro links dictionary entries by pushing a new context
+        ; and redefining %$link as a context-local label at the current
+        ; location (address) in the code. It then writes a 16-bit data
+        ; word with the value of %$link in the previous context (i.e. the
+        ; previous expansion of link), effectively writing a link field to
+        ; the previous location.
+%macro link 0
+%push
+%$link:
+        dw %$$link
+%endmacro
+
+        ; defword lays out a dictionary entry where it is expanded.
+%macro defword 3-4 0            ; name, length, label, flags
+word_%3:
+        link                    ; link to previous word
+        db %4+%2                ; flags+length
+        db %1                   ; name
+%3:                             ; code starts here
+%endmacro
+
+        ; NEXT advances execution to the next word. The actual code is
+        ; placed further ahead for strategic reasons. The macro has to be
+        ; defined here, since it's used in the words defined ahead.
+%define NEXT jmp next
+
+        ; sectorforth has only eight primitive words, with which
+        ; everything else can be built in Forth:
+        ;
+        ; @ ( addr -- x )       Fetch memory at addr
+        ; ! ( x addr -- )       Store x at addr
+        ; sp@ ( -- addr )       Get current data stack pointer
+        ; rp@ ( -- addr )       Get current return stack pointer
+        ; 0= ( x -- f )         -1 if top of stack is 0, 0 otherwise
+        ; + ( x1 x2 -- n )      Add the two values at the top of the stack
+        ; nand ( x1 x2 -- n )   NAND the two values at the top of the stack
+        ; exit ( r:addr -- )    Resume execution at address at the top of
+        ;                       the return stack
+        defword "@",1,FETCH
+        pop bx
+        push word [bx]
+        NEXT
+
+        defword "!",1,STORE
+        pop bx
+        pop word [bx]
+        NEXT
+
+        defword "sp@",3,SPFETCH
+        push sp
+        NEXT
+
+        defword "rp@",3,RPFETCH
+        push bp
+        NEXT
+
+        defword "0=",2,ZEROEQUALS
+        pop ax
+        test ax,ax
+        setnz al                ; AL=0 if ZF=1, else AL=1
+        dec ax                  ; AL=ff if AL=0, else AL=0
+        cbw                     ; AH=AL
+        push ax
+        NEXT
+
+        defword "+",1,PLUS
+        pop bx
+        pop ax
+        add ax,bx
+        push ax
+        NEXT
+
+        defword "nand",4,NAND
+        pop bx
+        pop ax
+        and ax,bx
+        not ax
+        push ax
+        NEXT
+
+        defword "exit",4,EXIT
+        xchg sp,bp              ; swap SP and BP, SP controls return stack
+        pop si                  ; pop address to next word
+        xchg sp,bp              ; restore SP and BP
+        NEXT
+
+        ; Besides primitives, a few variables are exposed to Forth code:
+        ; TIB, STATE, >IN, HERE, and LATEST. With sectorforth's >IN being
+        ; both an offset and a pointer into TIB (as TIB starts at 0x0000),
+        ; TIB could be left out. But it is exposed so that sectorforth
+        ; code that accesses the parse area can be written in an idiomatic
+        ; fashion (e.g. TIB >IN @ +).
+        defword "tib",3,TIBVAR
+        push word TIB
+        NEXT
+
+        defword "state",5,STATEVAR
+        push word STATE
+        NEXT
+
+        defword ">in",3,TOINVAR
+        push word TOIN
+        NEXT
+
+        ; Strategically define next here so most jumps to it are short,
+        ; saving extra bytes that would be taken by near jumps.
+next:
+        lodsw                   ; load next word's address into AX
+        jmp ax                  ; jump directly to it
+
+        ; Words and data space for the HERE and LATEST variables.
+        defword "here",4,HEREVAR
+        push word HERE
+        NEXT
+HERE:   dw start_HERE
+
+        defword "latest",6,LATESTVAR
+        push word LATEST
+        NEXT
+LATEST: dw word_SEMICOLON       ; initialized to last word in dictionary
+
+        ; Define a couple of I/O primitives to make things interactive.
+        ; They can also be used to build a richer interpreter loop.
+        ;
+        ; KEY waits for a key press and pushes its scan code (AH) and
+        ; ASCII character (AL) to the stack, both in a single cell.
+        defword "key",3,KEY
+        mov ah,0
+        int 0x16
+        push ax
+        NEXT
+
+        ; EMIT writes to the screen the ASCII character corresponding to
+        ; the lowest 8 bits of the value at the top of the stack.
+        defword "emit",4,EMIT
+        pop ax
+        call writechar
+        NEXT
+
+        ; The colon compiler reads a name from the terminal input buffer,
+        ; creates a dictionary entry for it, writes machine code to jump
+        ; to DOCOL, updates LATEST and HERE, and switches to compilation
+        ; state.
+        defword ":",1,COLON
+        call token              ; parse word from input
+        push si
+        mov si,di               ; set parsed word as string copy source
+        mov di,[HERE]           ; set current value of HERE as destination
+        mov ax,[LATEST]         ; get pointer to latest defined word
+        mov [LATEST],di         ; update LATEST to new word being defined
+        stosw                   ; link pointer
+        mov al,cl
+        or al,F_HIDDEN          ; hide new word while it's being defined
+        stosb                   ; word length
+        rep movsb               ; word name
+        mov ax,0x26ff
+        stosw                   ; compile near jump, absolute indirect...
+        mov ax,DOCOL.addr
+        stosw                   ; ...to DOCOL
+        mov [HERE],di           ; update HERE to next free position
+        mov byte [STATE],1      ; switch to compilation state
+        pop si
+        NEXT
+
+        ; DOCOL sets up and starts execution of a user-defined words.
+        ; Those differ from words defined in machine code by being
+        ; sequences of addresses to other words, so a bit of code is
+        ; needed to save the current value of SI (this Forth's instruction
+        ; pointer), and point it to the sequence of addresses that makes
+        ; up a word's body.
+        ;
+        ; DOCOL advances AX 4 bytes, and then moves that value to SI. When
+        ; DOCOL is jumped to, AX points to the code field of the word
+        ; about to be executed. The 4 bytes being skipped are the actual
+        ; jump instruction to DOCOL itself, inserted by the colon compiler
+        ; when it creates a new entry in the dictionary.
+DOCOL:
+        xchg sp,bp              ; swap SP and BP, SP controls return stack
+        push si                 ; push current "instruction pointer"
+        xchg sp,bp              ; restore SP and BP
+        add ax,4                ; skip word's code field
+        mov si,ax               ; point "instruction pointer" to word body
+        NEXT                    ; start executing the word
+
+        ; The jump instruction inserted by the compiler is an indirect
+        ; jump, so it needs to read the location to jump to from another
+        ; memory location.
+.addr:  dw DOCOL
+
+        ; Semicolon is the only immediate primitive. It writes the address
+        ; of EXIT to the end of a new word definition, makes the word
+        ; visible in the dictionary, and switches back to interpretation
+        ; state.
+        defword ";",1,SEMICOLON,F_IMMEDIATE
+        mov bx,[LATEST]
+        and byte [bx+2],~F_HIDDEN       ; reveal new word
+        mov byte [STATE],0      ; switch to interpretation state
+        mov ax,EXIT             ; prepare to compile EXIT
+compile:
+        mov di,[HERE]
+        stosw                   ; compile contents of AX to HERE
+        mov [HERE],di           ; advance HERE to next cell
+        NEXT
+
+        ; Execution starts here.
+start:
+        cld                     ; clear direction flag
+
+        ; Set up segment registers to point to the same segment as CS.
+        push cs
+        push cs
+        push cs
+        pop ds
+        pop es
+        pop ss
+
+        ; Skip error signaling on initialization
+        jmp init
+
+        ; Display a red '!!' to let the user know an error happened and the
+        ; interpreter is being reset
+error:
+        mov ax,0x0921           ; write '!'
+        mov bx,0x0004           ; black background, red text
+        mov cx,2                ; twice
+        int 0x10
+
+        ; Initialize stack pointers, state, and terminal input buffer.
+init:
+        mov bp,RP0              ; BP is the return stack pointer
+        mov sp,SP0              ; SP is the data stack pointer
+
+        ; Fill TIB with zeros, and set STATE and >IN to 0
+        mov al,0
+        mov cx,STATE+4
+        mov di,TIB
+        rep stosb
+
+        ; Enter the interpreter loop.
+        ;
+        ; Words are read one at time and searched for in the dictionary.
+        ; If a word is found in the dictionary, it is either interpreted
+        ; (i.e. executed) or compiled, depending on the current state and
+        ; the word's IMMEDIATE flag.
+        ;
+        ; When a word is not found, the state of the interpreter is reset:
+        ; the data and return stacks are cleared as well as the terminal
+        ; input buffer, and the interpreter goes into interpretation mode.
+interpreter:
+        call token              ; parse word from input
+        mov bx,[LATEST]         ; start searching for it in the dictionary
+.1:     test bx,bx              ; zero?
+        jz error                ; not found, reset interpreter state
+        mov si,bx
+        lodsw                   ; skip link
+        lodsb                   ; read flags+length
+        mov dl,al               ; save those for later use
+        test al,F_HIDDEN        ; entry hidden?
+        jnz .2                  ; if so, skip it
+        and al,LENMASK          ; mask out flags
+        cmp al,cl               ; same length?
+        jne .2                  ; if not, skip entry
+        push cx
+        push di
+        repe cmpsb              ; compare strings
+        pop di
+        pop cx
+        je .3                   ; if equal, search is over
+.2:     mov bx,[bx]             ; skip to next entry
+        jmp .1                  ; try again
+.3:     mov ax,si               ; after comparison, SI points to code field
+        mov si,.loop            ; set SI so NEXT loops back to interpreter
+        ; Decide whether to interpret or compile the word. The IMMEDIATE
+        ; flag is located in the most significant bit of the flags+length
+        ; byte. STATE can only be 0 or 1. When ORing those two, these are
+        ; the possibilities:
+        ;
+        ; IMMEDIATE     STATE         OR   ACTION
+        ;   0000000   0000000   00000000   Interpret
+        ;   0000000   0000001   00000001   Compile
+        ;   1000000   0000000   10000000   Interpret
+        ;   1000000   0000001   10000001   Interpret
+        ;
+        ; A word is only compiled when the result of that OR is 1.
+        ; Decrementing that result sets the zero flag for a conditional
+        ; jump.
+        and dl,F_IMMEDIATE      ; isolate IMMEDIATE flag
+        or dl,[STATE]           ; OR with state
+        dec dl                  ; decrement
+        jz compile              ; if result is zero, compile
+        jmp ax                  ; otherwise, interpret (execute) the word
+.loop:  dw interpreter
+
+        ; Parse a word from the terminal input buffer and return its
+        ; address and length in DI and CX, respectively.
+        ;
+        ; If after skipping spaces a 0 is found, more input is read from
+        ; the keyboard into the terminal input buffer until return is
+        ; pressed, at which point execution jumps back to the beginning of
+        ; token so it can attempt to parse a word again.
+        ;
+        ; Before reading input from the keyboard, a CRLF is emitted so
+        ; the user can enter input on a fresh, blank line on the screen.
+token:
+        mov di,[TOIN]           ; starting at the current position in TIB
+        mov cx,-1               ; search "indefinitely"
+        mov al,32               ; for a character that's not a space
+        repe scasb
+        dec di                  ; result is one byte past found character
+        cmp byte [di],0         ; found a 0?
+        je .readline            ; if so, read more input
+        mov cx,-1               ; search "indefinitely" again
+        repne scasb             ; this time, for a space
+        dec di                  ; adjust DI again
+        mov [TOIN],di           ; update current position in TIB
+        not cx                  ; after ones' complement, CX=length+1
+        dec cx                  ; adjust CX to correct length
+        sub di,cx               ; point to start of parsed word
+        ret
+.readline:
+        mov al,13
+        call writechar          ; CR
+        mov al,10
+        call writechar          ; LF
+        mov di,TIB              ; read into TIB
+.1:     mov ah,0                ; wait until a key is pressed
+        int 0x16
+        cmp al,13               ; return pressed?
+        je .3                   ; if so, finish reading
+        cmp al,8                ; backspace pressed?
+        je .2                   ; if so, erase character
+        call writechar          ; otherwise, write character to screen
+        stosb                   ; store character in TIB
+        jmp .1                  ; keep reading
+.2:     cmp di,TIB              ; start of TIB?
+        je .1                   ; if so, there's nothing to erase
+        dec di                  ; erase character in TIB
+        call writechar          ; move cursor back one character
+        mov ax,0x0a20           ; erase without moving cursor
+        mov cx,1
+        int 0x10                ; (BH already set to 0 by writechar)
+        jmp .1                  ; keep reading
+.3:     mov ax,0x0020
+        stosw                   ; put final delimiter and 0 in TIB
+        call writechar          ; write a space between user input and
+                                ; execution output
+        mov word [TOIN],0       ; point >IN to start of TIB
+        jmp token               ; try parsing a word again
+
+        ; writechar writes a character to the screen. It uses INT 10/AH=0e
+        ; to perform teletype output, writing the character, updating the
+        ; cursor, and scrolling the screen, all in one go. Writing
+        ; backspace using the BIOS only moves the cursor backwards within
+        ; a line, but does not move it back to the previous line.
+        ; writechar addresses that.
+writechar:
+        mov bh,0                ; video page 0 for all BIOS calls
+        mov ah,3                ; get cursor position (DH=row, DL=column)
+        int 0x10
+        mov ah,0x0e             ; teletype output
+        mov bl,0x7              ; black background, light grey text
+        int 0x10
+        cmp al,8                ; backspace?
+        jne .1                  ; if not, nothing else to do
+        test dl,dl              ; was cursor in first column?
+        jnz .1                  ; if not, nothing else to do
+        mov ah,2                ; move cursor
+        mov dl,79               ; to last column
+        dec dh                  ; of previous row
+        int 0x10
+.1:     ret
+
+        times 510-($-$$) db 0
+        db 0x55, 0xaa
+
+        ; New dictionary entries will be written starting here.
+start_HERE: