elisp.js/elisp/parser.js

////
// Emacs Lisp implementation in JavaScript.
//
// Copyright (c) 2009 Sami Samhuri - sami.samhuri@gmail.com
//
// Released under the terms of the MIT license.  See the included file
// LICENSE.

var utils = require('elisp/utils'),
    type = require('elisp/types');

// Reader for Emacs Lisp source text.
//
// data - the string to parse; defaults to the empty string when omitted.
//
// The read position starts at 0 so that peek(), consumeChar(), rest() and
// moreInput() give sensible answers even before parse() (which rewinds)
// has been called.
var Parser = function(data) {
    this.data = data || '';
    this.pos = 0;
};

// Exception type thrown by the parser.
//
// name    - short identifier of the failure (e.g. 'eof')
// message - human-readable description
//
// Instances carry parserError === true so handlers can recognise them
// cheaply.  The type inherits from Error so that `instanceof Error` checks
// and generic error reporting treat it like any other exception.
Parser.Error = function(name, message) {
    this.parserError = true;
    this.name = name;
    this.message = message;
    // Record where the error was raised, on hosts that provide a stack.
    this.stack = (new Error(message)).stack;
};
Parser.Error.prototype = new Error();
Parser.Error.prototype.constructor = Parser.Error;

// Human-readable messages, keyed by error name.  Parser.prototype.error
// looks the message up here when it builds a Parser.Error.
Parser.Error.messages = {
    eof: 'no more input'
};

// Raise a Parser.Error identified by `name`, with the message registered
// for that name in Parser.Error.messages.  Always throws.
Parser.prototype.error = function(name) {
    var message = Parser.Error.messages[name],
        failure = new Parser.Error(name, message);
    throw failure;
};

// Look at the character under the read position without advancing.
// Yields undefined when the position is past the end of the data.
Parser.prototype.peek = function() {
    var cursor = this.pos;
    return this.data[cursor];
};

// Return the character under the read position and step past it.
// Raises the 'eof' parser error when no input is left.
Parser.prototype.consumeChar = function() {
    var exhausted = this.pos >= this.data.length,
        ch;
    if (exhausted) {
        this.error('eof');
    }
    ch = this.data[this.pos];
    this.pos += 1;
    return ch;
};

// Advance the read position past everything the Lisp reader ignores
// between tokens: whitespace and comments.
//
// In Emacs Lisp a `;` starts a comment that runs to the end of the line.
// This function is only reached between tokens, so a `;` seen here is
// never part of a string or symbol.  The newline ending a comment is
// consumed by the whitespace branch on the next iteration.
Parser.prototype.consumeWhitespace = function() {
    var c;
    while ((c = this.peek())) {
        if (/\s/.test(c)) {
            this.consumeChar();
        }
        else if (c == ';') {
            // discard the rest of the line (or of the input)
            while ((c = this.peek()) && c != '\n') {
                this.consumeChar();
            }
        }
        else {
            break;
        }
    }
};

// Move the read position back to the first character of the data.
Parser.prototype.rewind = function() {
    var start = 0;
    this.pos = start;
};

// Return the not-yet-consumed tail of the data, i.e. everything from the
// read position onwards.
Parser.prototype.rest = function() {
    var source = this.data,
        from = this.pos;
    return source.substring(from);
};

// True while the read position is still inside the data.
Parser.prototype.moreInput = function() {
    var total = this.data.length;
    return total > this.pos;
};

// Parse a whole piece of source text.
//
// string - text to parse.  When omitted (undefined or null) the data the
//          parser already holds is parsed again from the start.
//
// Returns the top-level expressions as a Lisp list built by type.mkList.
// The plain array of expressions is also stored on this.expressions.
//
// An 'eof' parser error ends parsing early: its message is printed and
// the expressions read so far are returned.  Any other exception is
// re-thrown unchanged.
Parser.prototype.parse = function(string) {
    // Test for absence rather than truthiness so that an explicit empty
    // string replaces the previous data instead of re-parsing it.
    if (string !== undefined && string !== null) this.data = string;
    this.rewind();
    // parseExpression only skips whitespace *after* an expression; without
    // this, leading whitespace would be read as an empty symbol.
    this.consumeWhitespace();
    var exprs = [];
    while (this.moreInput()) {
        try {
            exprs.push(this.parseExpression());
        } catch (e) {
            if (e.parserError && e.name == 'eof') {
                print("error: " + e.message);
                break;
            }
            else {
                throw(e);
            }
        }
    }
    this.expressions = exprs;
    return type.mkList(exprs);
};

// Parse `string` with parse() and return only the first expression,
// i.e. the car of the resulting list.
Parser.prototype.parseOne = function(string) {
    var expressions = this.parse(string);
    return expressions.car();
};

// Generic "read until" helper: fold characters into a token until one
// matches `regex` or the input runs out.
//
// regex             - pattern recognising the terminating character
// initial           - starting value of the token
// next              - function(token, char) returning the updated token;
//                     called once per consumed character
// consumeTerminator - when truthy, also swallow the character that ended
//                     the scan (if there is one)
//
// Returns the final token.
Parser.prototype.parseUntil = function(regex, initial, next, consumeTerminator) {
    var accumulated = initial,
        ch;
    for (;;) {
        ch = this.peek();
        // stop at end of input or on the first terminating character
        if (!ch || ch.match(regex) !== null) {
            break;
        }
        accumulated = next(accumulated, this.consumeChar());
    }
    if (consumeTerminator) {
        if (this.peek()) {
            this.consumeChar();
        }
    }
    return accumulated;
};

// Parse a parenthesised list.  The read position must be on the opening
// '('.  Returns the elements as a Lisp list built by type.mkList.
//
// Elements are read with parseExpression, which returns the raw ')'
// character for a closing paren and undefined at end of input; either
// one ends the list (so an unterminated list yields what was read).
Parser.prototype.parseList = function() {
    var list = [],
        expr;
    // consume initial paren '('
    this.consumeChar();
    // parseExpression only skips whitespace after an expression, so
    // whitespace right after '(' would otherwise become an empty symbol.
    this.consumeWhitespace();
    // Strict comparison: only the raw ')' character closes the list.  A
    // loose != would coerce Lisp objects to strings and could end the
    // list early on an element that happens to print as ")".
    while ((expr = this.parseExpression()) && expr !== ')') {
        list.push(expr);
    }
    return type.mkList(list);
};

// Parse a dotted pair of the form (car . cdr).  The read position must be
// on the opening '('.  Returns a type.LispCons of the two expressions.
//
// Only the simple two-element form is handled: one expression, the dot,
// one expression.
Parser.prototype.parseCons = function() {
    var car, cdr;

    // consume initial paren '('
    this.consumeChar();
    // allow whitespace between '(' and the car
    this.consumeWhitespace();

    car = this.parseExpression();

    // ignore .
    this.parseExpression();

    cdr = this.parseExpression();

    // Consume the closing paren so the caller does not see a stray ')'
    // as the next expression.
    if (this.peek() == ')') this.consumeChar();

    return new type.LispCons(car, cdr);
};

// Parse a double-quoted string literal.  The read position must be on the
// opening quotation mark.  Returns a type.LispString holding the decoded
// contents; the closing quotation mark is consumed when present.
//
// Backslash handling follows the Emacs Lisp read syntax for strings:
//   * \n \t \r \f \v \a \b \e \d stand for the usual control characters
//   * a backslash followed by a newline or a space is ignored entirely
//   * any other escaped character (notably \" and \\) stands for itself
// Octal, hex and Unicode escapes are not decoded.
Parser.prototype.parseString = function() {
    // consume initial quotation mark
    this.consumeChar();
    var self = this,
        escapes = {
            'n': '\n', 't': '\t', 'r': '\r', 'f': '\f', 'v': '\v',
            'a': '\x07', 'b': '\b', 'e': '\x1b', 'd': '\x7f'
        };
    return new type.LispString(this.parseUntil(/"/, '', function(s,c){
        if (c == '\\') {
            // take the escaped character out of the stream ourselves so
            // that an escaped quote does not terminate the string
            c = self.consumeChar();
            if (c == '\n' || c == ' ') {
                return s;
            }
            if (Object.prototype.hasOwnProperty.call(escapes, c)) {
                c = escapes[c];
            }
        }
        return s + c;
    }, true /* consume terminator */));
};

// Read a symbol: every character up to (not including) the next
// whitespace character or parenthesis.  Returns a type.LispSymbol named
// after the collected text.
Parser.prototype.parseSymbol = function() {
    var appendChar = function(name, ch) {
            return name + ch;
        },
        name = this.parseUntil(/[\s()]/, '', appendChar);
    return new type.LispSymbol(name);
};

// Parse a slash-delimited regular expression literal such as /ab+c/ and
// return it as a JavaScript RegExp.  The read position must be on the
// opening slash; the closing slash is consumed when present.
//
// Still simplistic (no flags, no validation of the pattern), but escapes
// are preserved: `\/` becomes a literal slash in the pattern, and every
// other backslash sequence (\d, \s, \\, ...) is passed to RegExp with its
// backslash intact so it keeps its meaning.
Parser.prototype.parseRegex = function() {
    // consume initial slash
    this.consumeChar();
    var self = this;
    return new RegExp(this.parseUntil(/\//, '', function(s,c){
        if (c == '\\') {
            // read the escaped character ourselves so an escaped slash
            // does not end the literal
            c = self.consumeChar();
            if (c != '/') {
                c = '\\' + c;
            }
        }
        return s + c;
    }, true /* consume terminator */));
};


// In Emacs Lisp a trailing . is allowed on integers.
// Valid kinds of numbers we parse here are:
//
//   * Integers of the form 42, +17, -300, 7300. (trailing .), +1. and
//     -1.
//
//   * Floating point numbers of the form -4.5, 0.0, and +933825.3450133492
//
//   * Exponential notation for floats, e.g. 1.5e2 (150.0) or 420e-1 (42.0)
//     (There is no trailing . allowed anywhere in exponent notation)
//
// Binary, octal, hex, and arbitrary-radix integers are not yet parsed
// (e.g. #x100 == #o400 == #b100000000 == #24rag).
// Read a number at the read position and return it as a type.LispNumber.
// Callers are expected to have checked lookingAtNumber() first.
Parser.prototype.parseNumber = function() {
    var value = this.parseIntOrFloat(),
        exp;

    // now check for an exponent
    //
    // parseIntOrFloat leaves this.exponentAllowed false when the mantissa
    // ended in a bare trailing dot (e.g. "7."), in which case no exponent
    // may follow.
    if (this.exponentAllowed && (this.peek() == 'e' || this.peek() == 'E')) {
        this.consumeChar();

        // Technically this is an error as a float is not allowed for exponents
        // but the regex is strict enough to keep us from trying to do that.
        exp = this.parseIntOrFloat();

        value *= Math.pow(10, exp);
    }

    return new type.LispNumber(value);
};

// Read one signed decimal number -- integer or float, whichever is
// present -- and return it as a JavaScript number.  Both kinds share a
// single routine to keep the parser small.
//
// Side effect: sets this.exponentAllowed, which parseNumber consults.  It
// is true unless the digits were followed by a bare trailing dot ("7.").
Parser.prototype.parseIntOrFloat = function() {
    var negative = false,
        lead,
        number,
        following,
        fraction;

    this.exponentAllowed = true;

    // optional sign
    lead = this.peek();
    if (lead == '-' || lead == '+') {
        negative = (this.consumeChar() == '-');
    }

    // optional integer part (absent for inputs like ".5")
    if (this.peek() != '.') {
        number = this.parseUntil(/[^\d]/, 0, function(total, digit) {
            return total * 10 + parseInt(digit, 10);
        });
    }

    // a dot introduces either a fractional part or a bare trailing dot
    if (this.peek() == '.') {
        this.consumeChar();
        following = this.peek();
        if (following && following.match(/\d/)) {
            fraction = this.parseUntil(/[^\d]/, '', function(digits, digit) {
                return digits + digit;
            });
            // number is undefined when there was no integer part
            number = parseFloat('' + (number||'') + '.' + fraction);
        }
        else {
            this.exponentAllowed = false;
        }
    }

    // number could only be undefined for inputs that the lookingAtNumber
    // check is meant to reject beforehand.
    if (negative) {
        return -1*number;
    }
    return number;
};

// Decide whether the text at the read position is a number that
// parseNumber can read.  Does not move the read position.
//
// This check is paramount: it rejects some invalid inputs that the number
// parser itself does not catch.  The pattern accepts, after an optional
// sign:
//
//   * digits followed by a bare trailing dot         7300.  +1.
//   * digits, optionally with an exponent            42  420e-1
//   * a float with optional integer part and
//     optional exponent                              -4.5  .5  1.5e2
//
// and requires the number to be followed by ')', whitespace, or the end
// of the input, so that e.g. "1+" or "4x" are left to be read as symbols.
Parser.prototype.lookingAtNumber = function() {
    return /^[+-]?(?:\d+\.|(?:\d+|\d*\.\d+)(?:[eE][+-]?\d+)?)(?:[)\s]|$)/.test(this.rest());
};

// Lookahead meant to tell whether the '(' at the read position opens a
// dotted pair.  Detection is currently switched off: the function returns
// false unconditionally, and everything after that return is an
// unreachable draft kept for reference.
Parser.prototype.lookingAtCons = function() {
    // FIXME
    return false;

    // --- unreachable draft below ---
    // Remembers the read position, consumes one character, speculatively
    // parses up to two expressions, then restores the position.
    var orig_pos = this.pos,
	_ = this.consumeChar(),
	__ = _ && this.peek() && this.parseExpression(),
	cdr = __ && this.peek() &&this.parseExpression();
    this.pos = orig_pos; // rewind, like it never happened.
//    print('[Parser.lookingAtCons]');
    // NOTE(review): this asks for cdr to be both a cons and a symbol, and
    // cdr may be a falsy non-object here -- confirm the intended test
    // before re-enabling.
    return _ == ')' || cdr.isCons() && cdr.isSymbol() && cdr.symbolName() == '.';
};

// Read the next expression, dispatching on the character under the read
// position:
//
//   (   dotted pair (when lookingAtCons says so) or list
//   )   returned as the raw ')' character, immediately
//   '   a cons of the symbol `quote` and the following expression
//   "   string
//   otherwise a number if lookingAtNumber matches, else a symbol
//
// When there is no character to read, diagnostics are printed and
// undefined is returned.  Except in the ')' case, trailing whitespace is
// skipped before returning.
Parser.prototype.parseExpression = function() {
    var ch = this.peek(),
        result;

    switch (ch) {
    case ')':
        // handed straight back so list parsing can see the terminator
        return this.consumeChar();

    case '(':
        result = this.lookingAtCons() ? this.parseCons() : this.parseList();
        break;

    case "'":
        this.consumeChar();
        result = new type.LispCons(new type.LispSymbol('quote'), this.parseExpression());
        break;

    case '"':
        result = this.parseString();
        break;

    default:
        if (this.lookingAtNumber()) {
            result = this.parseNumber();
        }
        else if (ch) {
            result = this.parseSymbol();
        }
        else {
            // nothing to read: report and fall through with no value
            if (this.pos == this.data.length) {
                print('[error] no more input. unterminated string or list? (continuing anyway)');
            }
            print('[warning] in Parser.parseExpression: unrecognized char "' + ch + '"');
            print('this.pos = ' + this.pos);
            print('this.data.length = ' + this.data.length);
            print('this.rest = ' + this.rest());
        }
    }

    this.consumeWhitespace();
    return result;
};

exports.Parser = Parser;