Replace command splitter with tokenizer parser

This commit is contained in:
Sami Samhuri 2026-02-07 14:57:09 -08:00
parent 6a29c97314
commit beee5deca5
No known key found for this signature in database
3 changed files with 365 additions and 64 deletions

View file

@ -1,76 +1,368 @@
require "shell/quote_cursor"
module Shell
class StringParser
class << self
def split_commands(line)
commands = []
command = +""
cursor = QuoteCursor.new
next_op = :always
i = 0
Token = Struct.new(:type, :value, keyword_init: true)
while i < line.length
c = line[i]
if cursor.unquoted?
case c
when ";"
commands << {command: command, op: next_op}
command = +""
next_op = :always
i += 1
next
when "&"
if line[i + 1] == "&"
if command.strip.empty?
raise ArgumentError, "syntax error near unexpected token `&&`"
end
commands << {command: command, op: next_op}
command = +""
next_op = :and
i += 2
next
end
end
end
segment, i = cursor.consume(line, i)
command << segment
end
if next_op == :and && command.strip.empty?
raise ArgumentError, "syntax error: expected command after `&&`"
end
commands << {command: command, op: next_op}
commands
class Scanner
# Builds a scanner over +line+.
#
# line  - the String to scan.
# index - the offset at which scanning starts (defaults to 0).
def initialize(line, index: 0)
  @line, @index = line, index
end
def read_dollar_paren(line, start_index)
output = +""
i = start_index
depth = 1
cursor = QuoteCursor.new
attr_reader :index
while i < line.length
c = line[i]
def tokenize_command_list
tokens = []
segment_start = index
if cursor.unquoted?
case c
when "("
depth += 1
when ")"
depth -= 1
return [output, i + 1] if depth.zero?
end
until eof?
c = current_char
if c == ";"
tokens << Token.new(type: :text, value: @line[segment_start...index])
tokens << Token.new(type: :separator, value: :always)
advance
segment_start = index
next
end
segment, i = cursor.consume(line, i)
output << segment
if c == "&" && peek(1) == "&"
tokens << Token.new(type: :text, value: @line[segment_start...index])
tokens << Token.new(type: :separator, value: :and)
advance(2)
segment_start = index
next
end
case c
when "\\"
advance_escape
when "'"
skip_single_quoted
when "\""
skip_double_quoted
when "`"
skip_backtick
when "$"
if peek(1) == "("
if peek(2) == "("
skip_arithmetic_substitution
else
skip_command_substitution
end
else
advance
end
else
advance
end
end
tokens << Token.new(type: :text, value: @line[segment_start...index])
tokens
end
# Collects the body of a command substitution. The scanner index must point
# at the first character after "$(".
#
# Escapes, quoted strings, backtick spans and arithmetic expansions are
# copied verbatim through the read_* helpers, so parentheses inside them do
# not affect nesting. Nested "$(" and bare "(" deepen the nesting; ")"
# closes one level.
#
# Returns [body, end_index], where end_index is the offset just past the
# closing ")". The scanner itself is left positioned on that ")".
# Raises ArgumentError when the line ends before the substitution is closed.
def read_dollar_paren_body
  body = +""
  nesting = 1

  until eof?
    ch = current_char

    case ch
    when "\\"
      body << read_escape
    when "'"
      body << read_single_quoted
    when "\""
      body << read_double_quoted
    when "`"
      body << read_backtick
    when "$"
      if peek(1) != "("
        # A lone "$" is ordinary text.
        body << ch
        advance
      elsif peek(2) == "("
        body << read_arithmetic_substitution
      else
        body << "$("
        advance(2)
        nesting += 1
      end
    when "("
      body << ch
      nesting += 1
      advance
    when ")"
      nesting -= 1
      # The final ")" is not part of the body and is not consumed here.
      return [body, index + 1] if nesting.zero?

      body << ch
      advance
    else
      body << ch
      advance
    end
  end

  raise ArgumentError, "Unmatched $(...)"
end
private
# True once the scan position has reached or passed the end of the line.
def eof?
  @line.length <= index
end
# The character at the current scan position, or nil when the position is
# past the end of the line.
def current_char
  @line.slice(index)
end
# Looks at the character +offset+ positions away from the current one
# without moving the scanner. Returns nil when that position is past the end
# of the line.
def peek(offset)
  position = index + offset
  @line[position]
end
# Moves the scan position forward by +count+ characters (one by default)
# and returns the new position.
def advance(count = 1)
  @index = @index + count
end
# Steps over a backslash and the character it escapes. A backslash that is
# the last character of the line is consumed on its own.
def advance_escape
  advance
  return if eof?

  advance
end
# Advances past a single-quoted string, starting on its opening quote.
# Nothing inside single quotes is special; the next "'" ends the string.
#
# Raises ArgumentError when the line ends before the closing quote.
def skip_single_quoted
  advance # step over the opening quote

  loop do
    raise ArgumentError, "Unmatched quote" if eof?

    closing = current_char == "'"
    advance
    return if closing
  end
end
# Advances past a double-quoted string, starting on its opening quote.
#
# Backslash escapes, backtick spans, "$(...)" and "$((...))" inside the
# string are skipped as units, so a quote character within them does not
# end the string.
#
# Raises ArgumentError when the line ends before the closing quote.
def skip_double_quoted
  advance # step over the opening quote

  until eof?
    ch = current_char

    if ch == "\""
      advance
      return
    elsif ch == "\\"
      advance_escape
    elsif ch == "`"
      skip_backtick
    elsif ch == "$" && peek(1) == "("
      if peek(2) == "("
        skip_arithmetic_substitution
      else
        skip_command_substitution
      end
    else
      advance
    end
  end

  raise ArgumentError, "Unmatched quote"
end
# Advances past a backtick span, starting on its opening backtick.
#
# Backslash escapes, "$(...)" and "$((...))" inside the span are skipped as
# units, so a backtick within them does not end the span.
#
# Raises ArgumentError when the line ends before the closing backtick.
def skip_backtick
  advance # step over the opening backtick

  until eof?
    ch = current_char

    if ch == "`"
      advance
      return
    elsif ch == "\\"
      advance_escape
    elsif ch == "$" && peek(1) == "("
      if peek(2) == "("
        skip_arithmetic_substitution
      else
        skip_command_substitution
      end
    else
      advance
    end
  end

  raise ArgumentError, "Unmatched backtick"
end
# Advances past a command substitution, starting on the "$" of its "$(".
#
# Nested "$(" and bare "(" each open one more level and ")" closes one; the
# method returns right after the ")" that closes the outermost level.
# Escapes, quoted strings, backtick spans and arithmetic expansions are
# skipped as units so parentheses inside them are not counted.
#
# Raises ArgumentError when the line ends before the substitution is closed.
def skip_command_substitution
  advance(2) # consume "$("
  open_count = 1

  until eof?
    ch = current_char

    if ch == "\\"
      advance_escape
    elsif ch == "'"
      skip_single_quoted
    elsif ch == "\""
      skip_double_quoted
    elsif ch == "`"
      skip_backtick
    elsif ch == "$" && peek(1) == "(" && peek(2) == "("
      skip_arithmetic_substitution
    elsif ch == "$" && peek(1) == "("
      advance(2)
      open_count += 1
    elsif ch == "("
      advance
      open_count += 1
    elsif ch == ")"
      advance
      open_count -= 1
      return if open_count.zero?
    else
      advance
    end
  end

  raise ArgumentError, "Unmatched $(...)"
end
# Advances past an arithmetic expansion, starting on the "$" of its "$((".
#
# Every parenthesis is balanced individually: the two opened by "$((" plus
# any grouping parentheses inside the expression. Looking only for a literal
# "))" would end the scan too early on input such as "$((2*(1+2)))", where
# the first "))" closes the inner group and only one of the outer
# parentheses, leaving a dangling ")" behind.
#
# Escapes, quoted strings, backtick spans, nested "$(...)" and nested
# "$((...))" are skipped as units so parentheses inside them are not counted.
#
# Raises ArgumentError when the line ends before the expansion is closed.
def skip_arithmetic_substitution
  advance(3) # consume "$(("
  open_parens = 2 # both parentheses of "$((" still need a matching ")"

  until eof?
    case current_char
    when "\\"
      advance_escape
    when "'"
      skip_single_quoted
    when "\""
      skip_double_quoted
    when "`"
      skip_backtick
    when "$"
      if peek(1) == "("
        if peek(2) == "("
          # A nested expansion balances its own parentheses.
          skip_arithmetic_substitution
        else
          skip_command_substitution
        end
      else
        advance
      end
    when "("
      advance
      open_parens += 1
    when ")"
      advance
      open_parens -= 1
      return if open_parens.zero?
    else
      advance
    end
  end

  raise ArgumentError, "Unmatched $((...))"
end
# Consumes a backslash escape and returns the consumed text verbatim.
def read_escape
  from = index
  advance_escape
  @line.slice(from...index)
end
# Consumes a single-quoted string and returns it verbatim, quotes included.
def read_single_quoted
  from = index
  skip_single_quoted
  @line.slice(from...index)
end
# Consumes a double-quoted string and returns it verbatim, quotes included.
def read_double_quoted
  from = index
  skip_double_quoted
  @line.slice(from...index)
end
# Consumes a backtick span and returns it verbatim, backticks included.
def read_backtick
  from = index
  skip_backtick
  @line.slice(from...index)
end
# Consumes an arithmetic expansion and returns it verbatim, delimiters
# included.
def read_arithmetic_substitution
  from = index
  skip_arithmetic_substitution
  @line.slice(from...index)
end
end
class << self
# Splits +line+ into a list of commands using the scanner's token stream.
#
# Returns an Array of Hashes of the form {command: String, op: Symbol}.
# +op+ is :and when the command was introduced by "&&" and :always
# otherwise.
#
# Raises ArgumentError when "&&" has no command before it or after it, or
# when the scanner yields a token type other than :text or :separator.
def split_commands(line)
  pending_op = :always

  Scanner.new(line).tokenize_command_list.each_with_object([]) do |token, commands|
    case token.type
    when :text
      text = token.value
      commands << {command: text, op: pending_op}
      if pending_op == :and && text.strip.empty?
        raise ArgumentError, "syntax error: expected command after `&&`"
      end

      pending_op = :always
    when :separator
      unless token.value == :and
        pending_op = :always
        next
      end

      # "&&" needs a non-blank command on its left-hand side.
      previous = commands.last
      if previous.nil? || previous[:command].strip.empty?
        raise ArgumentError, "syntax error near unexpected token `&&`"
      end

      pending_op = :and
    else
      raise ArgumentError, "Unknown token type: #{token.type}"
    end
  end
end
# Reads a command substitution body from +line+ by delegating to a Scanner
# positioned at +start_index+, and returns whatever
# Scanner#read_dollar_paren_body returns.
def read_dollar_paren(line, start_index)
  scanner = Scanner.new(line, index: start_index)
  scanner.read_dollar_paren_body
end
end
end
end

View file

@ -1,4 +1,3 @@
require "shellwords"
require "open3"
require "shell/quote_cursor"
require "shell/string_parser"
@ -54,12 +53,12 @@ module Shell
end
end
# Lifted directly from Ruby 4.0.0.
# Adapted from Ruby's Shellwords splitting logic.
#
# Splits a string into an array of tokens in the same way the UNIX
# Bourne shell does.
#
# argv = Shellwords.split('here are "two words"')
# argv = shellsplit('here are "two words"')
# argv #=> ["here", "are", "two words"]
#
# +line+ must not contain NUL characters because of the nature of
@ -69,7 +68,7 @@ module Shell
# metacharacters except for the single and double quotes and
# backslash are not treated as such.
#
# argv = Shellwords.split('ruby my_prog.rb | less')
# argv = shellsplit('ruby my_prog.rb | less')
# argv #=> ["ruby", "my_prog.rb", "|", "less"]
#
# String#shellsplit is a shortcut for this function.

View file

@ -93,6 +93,16 @@ class ShellTest < Minitest::Test
assert_equal "hi", `#{A1_PATH} -c 'echo $(echo hi)'`.chomp
end
# ";" and "&&" inside "$(...)" belong to the substitution and must not
# split the outer command line.
def test_keeps_control_operators_inside_command_substitution
  scripts = [
    "echo $(echo hi; echo bye)",
    "echo $(echo hi && echo bye)"
  ]

  scripts.each do |script|
    stdout, stderr, status = Open3.capture3(A1_PATH, "-c", script)
    assert status.success?, stderr
    assert_equal "hi bye\n", stdout
  end
end
# An escaped double quote inside a quoted argument within "$(...)" must
# come through the substitution intact.
def test_expands_command_substitution_with_escaped_quote
  output = `#{A1_PATH} -c 'echo $(printf \"%s\" \"a\\\"b\")'`
  assert_equal "a\"b", output.chomp
end