# Konamiman/msx-basic-parser.rb

## msx-basic-parser.rb
# MSX BASIC tokenized file parser written in Ruby
# By Konamiman, http://www.konamiman.com
# This is my first Ruby thing, so improvement suggestions are highly welcome

# Program entry point: validates the command line, loads the input bytes,
# opens the optional output file, checks the initial marker byte and then
# emits every parsed program line until get_and_parse_line returns nil.
def main
  check_parameters

  @file_bytes_enumerator = read_file_bytes.each
  @output_file = create_output_file

  discard_text_start_code

  # Lines go to the output file when one was requested, otherwise to STDOUT.
  listing_line = get_and_parse_line
  until listing_line.nil?
    destination = @output_file || $stdout
    destination.puts(listing_line)
    listing_line = get_and_parse_line
  end

  terminate
end

# Prints the usage text and terminates the program when the script is run
# without arguments; returns immediately (nil) when at least one argument
# is present.
def check_parameters
  return unless ARGV.empty?

  usage_text = <<~USAGE
    MSX-BASIC tokenized file parser
    By Konamiman, 11/2017

    Usage: #{File.basename(__FILE__)} <input file> [<output file>]
    Result is dumped to STDOUT if <output file> is omitted

    Reference: MSX2 Technical Handbook, Chapter 2, Section 3.3
    http://www.konamiman.com/msx/msx2th/th-2.txt
  USAGE
  puts usage_text

  terminate
end

# Ends the program, closing the output file first when one is open.
#
# message - optional error text. When given, the process ends through
#           Kernel#abort(message); otherwise it ends through exit(0).
def terminate(message = nil)
  @output_file && @output_file.close

  if message
    abort(message)
  else
    exit(0)
  end
end

# Loads the input file named by the first command line argument.
#
# Returns the file contents as an Array of byte values (Integers).
# Terminates the program with an error message when the file does not
# exist or cannot be read.
def read_file_bytes
  input_path = ARGV[0]

  terminate("*** File '#{input_path}' not found!") unless File.exist?(input_path)

  begin
    # Binary mode so that the bytes are returned exactly as stored.
    File.open(input_path, 'rb') { |input| input.read.bytes }
  rescue StandardError => error
    terminate("*** Error when reading input file: #{error.message}")
  end
end

# Opens the output file named by the second command line argument.
#
# Returns the File opened for writing, or nil when no second argument was
# given. Terminates the program with an error message when the file cannot
# be created.
def create_output_file
  return nil unless ARGV.length >= 2

  File.new(ARGV[1], 'w')
rescue StandardError => error
  terminate("*** Error when creating output file: #{error.message}")
end

# Consumes and returns the next byte of the input.
# Terminates the program with an error message when the input is exhausted.
def get_byte
  next_byte = @file_bytes_enumerator.next
rescue StopIteration
  terminate('*** End of file reached before finding the end of program mark')
else
  next_byte
end

# Returns the next byte of the input without consuming it, or nil when the
# input is exhausted.
#
# Enumerator#peek raises StopIteration at the end of the sequence; left
# unrescued, a file truncated right before the peeked position would crash
# with a backtrace. Returning nil lets the caller see "nothing follows", and
# the next get_byte call then reports the truncated file through its own
# end-of-file handling.
def peek_byte
  @file_bytes_enumerator.peek
rescue StopIteration
  nil
end

# Consumes the first byte of the input and verifies that it is 255.
# Terminates the program with an error message when it is anything else.
def discard_text_start_code
  first_byte = get_byte
  return if first_byte == 255

  terminate("*** Expected an initial 255 byte, got #{first_byte} instead")
end

# Reads one tokenized program line from the input and converts it to text.
#
# Bytes consumed, in order: a two-byte pointer to the next line, a two-byte
# line number, then token/character bytes up to a terminating zero byte.
#
# Returns the line as a String ("<line number> <text>"), or nil when the
# next-line pointer is zero (end of program mark).
# Terminates the program on an absolute line address token (0x0D) or on an
# unknown FF-prefixed token.
def get_and_parse_line
  next_line_pointer = extract_two_byte_integer
  return nil if next_line_pointer.zero?

  line_number = extract_two_byte_integer
  # Integer#to_s yields a US-ASCII string, so the Integer bytes appended
  # below are stored as single raw bytes rather than as UTF-8 code points.
  line_text = line_number.to_s + ' '

  # Input bytes are Integers, so the quote has to be matched by its code;
  # comparing against the String '"' never matches.
  double_quote = 0x22
  inside_string = false

  until (the_byte = get_byte).zero?
    if inside_string
      # Inside a string literal every byte is copied verbatim.
      inside_string = false if the_byte == double_quote
      line_text << the_byte
      next
    end

    case the_byte
    when double_quote # start of string literal
      inside_string = true
      line_text << the_byte
    when 0xE6 # comes after ' (nothing to emit)
    when 0x0B # octal number
      line_text << '&O' + extract_two_byte_integer.to_s(8)
    when 0x0C # hex number
      line_text << '&H' + format('%X', extract_two_byte_integer)
    when 0x0D # line number after RUN
      terminate('*** ERROR: Got an absolute line number address. This should happen only at runtime, not inside a BASIC file!')
    when 0x0E, 0x1C # line number before RUN, or integer between 256 and 32767
      line_text << extract_two_byte_integer.to_s
    when 0x0F # number from 10 to 255
      line_text << get_byte.to_s
    when 0x11..0x1A # number from 0 to 9
      line_text << (the_byte - 0x11).to_s
    when 0x1D # single precision real
      line_text << parse_real_number((1..4).collect { get_byte })
    when 0x1F # double precision real
      line_text << parse_real_number((1..8).collect { get_byte })
    when 0x3A # colon or token prefixed with 3A
      next_byte = peek_byte
      if @tokens_prefixed_with_3A.key?(next_byte)
        line_text << @tokens_prefixed_with_3A[next_byte]
        get_byte
      else
        line_text << the_byte
      end
    when 0xFF # token prefixed with FF
      function_code = get_byte
      function_name = @tokens_prefixed_with_FF[function_code]
      # Appending nil would raise TypeError; report the bad token instead.
      terminate(format('*** Unknown token: FF %02X', function_code)) if function_name.nil?
      line_text << function_name
    else
      # Known keyword, or a plain character byte copied as is.
      line_text << @single_byte_tokens.fetch(the_byte, the_byte)
    end
  end

  line_text
end

# Consumes two bytes from the input and combines them into one Integer,
# the first byte being the low-order one.
def extract_two_byte_integer
  low_byte = get_byte
  high_byte = get_byte
  low_byte + (high_byte * 256)
end

# Splits a byte into its two 4-bit halves.
#
# byte - Integer byte value.
#
# Returns a two-element Array: [high nibble, low nibble].
def parse_bcd(byte)
  high_nibble = byte >> 4
  low_nibble = byte & 0x0F
  [high_nibble, low_nibble]
end

# Converts a BCD-encoded real number into its decimal text form.
#
# number_bytes - Array of byte values. The first byte is the exponent biased
#                by 64 (a zero byte means the value is zero); each remaining
#                byte holds two mantissa digits, split with parse_bcd.
#
# Returns a String in plain (non-exponential) notation, with no trailing
# fractional zeros and no dangling decimal point.
def parse_real_number(number_bytes)
  # TODO: use exponential notation for very big or very small exponents

  exponent = number_bytes[0] - 64
  return '0' if exponent == -64

  digits = number_bytes.drop(1).flat_map { |mantissa_byte| parse_bcd(mantissa_byte) }.map(&:to_s)

  if exponent <= 0
    # Value below 1: the mantissa follows "0." and |exponent| padding zeros.
    integer_part = '0'
    fraction = ('0' * exponent.abs) + digits.join
  else
    # The first `exponent` digits are the integer part; zeros are appended
    # when the mantissa holds fewer digits than that.
    integer_digits = digits.take(exponent)
    integer_part = integer_digits.join + ('0' * (exponent - integer_digits.length))
    fraction = digits.drop(exponent).join
  end

  # Only emit the decimal point when significant fractional digits remain,
  # so a whole number never ends in ".".
  fraction = fraction.sub(/0+\z/, '')
  fraction.empty? ? integer_part : "#{integer_part}.#{fraction}"
end

# Keywords and operators stored as a single token byte, ordered by byte value.
# 0xA1 and 0xE6 are absent on purpose: 0xA1 is looked up in the 3A-prefixed
# table and 0xE6 is handled separately by the line parser.
@single_byte_tokens = {
  0x81 => 'END', 0x82 => 'FOR', 0x83 => 'NEXT', 0x84 => 'DATA',
  0x85 => 'INPUT', 0x86 => 'DIM', 0x87 => 'READ', 0x88 => 'LET',
  0x89 => 'GOTO', 0x8A => 'RUN', 0x8B => 'IF', 0x8C => 'RESTORE',
  0x8D => 'GOSUB', 0x8E => 'RETURN',

  # Careful! MSX2 Technical Handbook says that REM has bytes 3A 8F,
  # but actually REM has just 8F and ' has 3A 8F E6
  0x8F => 'REM',

  0x90 => 'STOP', 0x91 => 'PRINT', 0x92 => 'CLEAR', 0x93 => 'LIST',
  0x94 => 'NEW', 0x95 => 'ON', 0x96 => 'WAIT', 0x97 => 'DEF',
  0x98 => 'POKE', 0x99 => 'CONT', 0x9A => 'CSAVE', 0x9B => 'CLOAD',
  0x9C => 'OUT', 0x9D => 'LPRINT', 0x9E => 'LLIST', 0x9F => 'CLS',
  0xA0 => 'WIDTH', 0xA2 => 'TRON', 0xA3 => 'TROFF', 0xA4 => 'SWAP',
  0xA5 => 'ERASE', 0xA6 => 'ERROR', 0xA7 => 'RESUME', 0xA8 => 'DELETE',
  0xA9 => 'AUTO', 0xAA => 'RENUM', 0xAB => 'DEFSTR', 0xAC => 'DEFINT',
  0xAD => 'DEFSNG', 0xAE => 'DEFDBL', 0xAF => 'LINE',
  0xB0 => 'OPEN', 0xB1 => 'FIELD', 0xB2 => 'GET', 0xB3 => 'PUT',
  0xB4 => 'CLOSE', 0xB5 => 'LOAD', 0xB6 => 'MERGE', 0xB7 => 'FILES',
  0xB8 => 'LSET', 0xB9 => 'RSET', 0xBA => 'SAVE', 0xBB => 'LFILES',
  0xBC => 'CIRCLE', 0xBD => 'COLOR', 0xBE => 'DRAW', 0xBF => 'PAINT',
  0xC0 => 'BEEP', 0xC1 => 'PLAY', 0xC2 => 'PSET', 0xC3 => 'PRESET',
  0xC4 => 'SOUND', 0xC5 => 'SCREEN', 0xC6 => 'VPOKE', 0xC7 => 'SPRITE',
  0xC8 => 'VDP', 0xC9 => 'BASE', 0xCA => 'CALL', 0xCB => 'TIME',
  0xCC => 'KEY', 0xCD => 'MAX', 0xCE => 'MOTOR', 0xCF => 'BLOAD',
  0xD0 => 'BSAVE', 0xD1 => 'DSKO$', 0xD2 => 'SET', 0xD3 => 'NAME',
  0xD4 => 'KILL', 0xD5 => 'IPL', 0xD6 => 'COPY', 0xD7 => 'CMD',
  0xD8 => 'LOCATE', 0xD9 => 'TO', 0xDA => 'THEN', 0xDB => 'TAB(',
  0xDC => 'STEP', 0xDD => 'USR', 0xDE => 'FN', 0xDF => 'SPC(',
  0xE0 => 'NOT', 0xE1 => 'ERL', 0xE2 => 'ERR', 0xE3 => 'STRING$',
  0xE4 => 'USING', 0xE5 => 'INSTR', 0xE7 => 'VARPTR', 0xE8 => 'CSRLIN',
  0xE9 => 'ATTR$', 0xEA => 'DSKI$', 0xEB => 'OFF', 0xEC => 'INKEY$',
  0xED => 'POINT', 0xEE => '>', 0xEF => '=',
  0xF0 => '<', 0xF1 => '+', 0xF2 => '-', 0xF3 => '*',
  0xF4 => '/', 0xF5 => '^', 0xF6 => 'AND', 0xF7 => 'OR',
  0xF8 => 'XOR', 0xF9 => 'EQV', 0xFA => 'IMP', 0xFB => 'MOD',
  0xFC => '\\'
}

# Tokens stored as a 0x3A byte followed by the key byte given here.
@tokens_prefixed_with_3A = { 0x8F => "'", 0xA1 => 'ELSE' }

# Keywords stored as a 0xFF byte followed by the key byte given here,
# ordered by byte value.
@tokens_prefixed_with_FF = {
  0x81 => 'LEFT$', 0x82 => 'RIGHT$', 0x83 => 'MID$', 0x84 => 'SGN',
  0x85 => 'INT', 0x86 => 'ABS', 0x87 => 'SQR', 0x88 => 'RND',
  0x89 => 'SIN', 0x8A => 'LOG', 0x8B => 'EXP', 0x8C => 'COS',
  0x8D => 'TAN', 0x8E => 'ATN', 0x8F => 'FRE', 0x90 => 'INP',
  0x91 => 'POS', 0x92 => 'LEN', 0x93 => 'STR$', 0x94 => 'VAL',
  0x95 => 'ASC', 0x96 => 'CHR$', 0x97 => 'PEEK', 0x98 => 'VPEEK',
  0x99 => 'SPACE$', 0x9A => 'OCT$', 0x9B => 'HEX$', 0x9C => 'LPOS',
  0x9D => 'BIN$', 0x9E => 'CINT', 0x9F => 'CSNG', 0xA0 => 'CDBL',
  0xA1 => 'FIX', 0xA2 => 'STICK', 0xA3 => 'STRIG', 0xA4 => 'PDL',
  0xA5 => 'PAD', 0xA6 => 'DSKF', 0xA7 => 'FPOS', 0xA8 => 'CVI',
  0xA9 => 'CVS', 0xAA => 'CVD', 0xAB => 'EOF', 0xAC => 'LOC',
  0xAD => 'LOF', 0xAE => 'MKI$', 0xAF => 'MKS$', 0xB0 => 'MKD$'
}

# Run the parser; every method and token table it uses is defined above.
main
	# MSX BASIC tokenized file parser written in Ruby
	# By Konamiman, http://www.konamiman.com
	# This is my first Ruby thing, so improvement suggestions are highly welcome

	# Program entry point: validates the command line, loads the input bytes,
	# opens the optional output file, checks the initial marker byte and then
	# emits every parsed program line until get_and_parse_line returns nil.
	def main
	  check_parameters

	  @file_bytes_enumerator = read_file_bytes.each
	  @output_file = create_output_file

	  discard_text_start_code

	  # Lines go to the output file when one was requested, otherwise to STDOUT.
	  listing_line = get_and_parse_line
	  until listing_line.nil?
	    destination = @output_file || $stdout
	    destination.puts(listing_line)
	    listing_line = get_and_parse_line
	  end

	  terminate
	end

	# Prints the usage text and terminates the program when the script is run
	# without arguments; returns immediately (nil) when at least one argument
	# is present.
	def check_parameters
	  return unless ARGV.empty?

	  usage_text = <<~USAGE
	    MSX-BASIC tokenized file parser
	    By Konamiman, 11/2017

	    Usage: #{File.basename(__FILE__)} <input file> [<output file>]
	    Result is dumped to STDOUT if <output file> is omitted

	    Reference: MSX2 Technical Handbook, Chapter 2, Section 3.3
	    http://www.konamiman.com/msx/msx2th/th-2.txt
	  USAGE
	  puts usage_text

	  terminate
	end

	# Ends the program, closing the output file first when one is open.
	#
	# message - optional error text. When given, the process ends through
	#           Kernel#abort(message); otherwise it ends through exit(0).
	def terminate(message = nil)
	  @output_file && @output_file.close

	  if message
	    abort(message)
	  else
	    exit(0)
	  end
	end

	# Loads the input file named by the first command line argument.
	#
	# Returns the file contents as an Array of byte values (Integers).
	# Terminates the program with an error message when the file does not
	# exist or cannot be read.
	def read_file_bytes
	  input_path = ARGV[0]

	  terminate("*** File '#{input_path}' not found!") unless File.exist?(input_path)

	  begin
	    # Binary mode so that the bytes are returned exactly as stored.
	    File.open(input_path, 'rb') { |input| input.read.bytes }
	  rescue StandardError => error
	    terminate("*** Error when reading input file: #{error.message}")
	  end
	end

	# Opens the output file named by the second command line argument.
	#
	# Returns the File opened for writing, or nil when no second argument was
	# given. Terminates the program with an error message when the file cannot
	# be created.
	def create_output_file
	  return nil unless ARGV.length >= 2

	  File.new(ARGV[1], 'w')
	rescue StandardError => error
	  terminate("*** Error when creating output file: #{error.message}")
	end

	# Consumes and returns the next byte of the input.
	# Terminates the program with an error message when the input is exhausted.
	def get_byte
	  next_byte = @file_bytes_enumerator.next
	rescue StopIteration
	  terminate('*** End of file reached before finding the end of program mark')
	else
	  next_byte
	end

	# Returns the next byte of the input without consuming it, or nil when the
	# input is exhausted.
	#
	# Enumerator#peek raises StopIteration at the end of the sequence; left
	# unrescued, a file truncated right before the peeked position would crash
	# with a backtrace. Returning nil lets the caller see "nothing follows", and
	# the next get_byte call then reports the truncated file through its own
	# end-of-file handling.
	def peek_byte
	  @file_bytes_enumerator.peek
	rescue StopIteration
	  nil
	end

	# Consumes the first byte of the input and verifies that it is 255.
	# Terminates the program with an error message when it is anything else.
	def discard_text_start_code
	  first_byte = get_byte
	  return if first_byte == 255

	  terminate("*** Expected an initial 255 byte, got #{first_byte} instead")
	end

	# Reads one tokenized program line from the input and converts it to text.
	#
	# Bytes consumed, in order: a two-byte pointer to the next line, a two-byte
	# line number, then token/character bytes up to a terminating zero byte.
	#
	# Returns the line as a String ("<line number> <text>"), or nil when the
	# next-line pointer is zero (end of program mark).
	# Terminates the program on an absolute line address token (0x0D) or on an
	# unknown FF-prefixed token.
	def get_and_parse_line
	  next_line_pointer = extract_two_byte_integer
	  return nil if next_line_pointer.zero?

	  line_number = extract_two_byte_integer
	  # Integer#to_s yields a US-ASCII string, so the Integer bytes appended
	  # below are stored as single raw bytes rather than as UTF-8 code points.
	  line_text = line_number.to_s + ' '

	  # Input bytes are Integers, so the quote has to be matched by its code;
	  # comparing against the String '"' never matches.
	  double_quote = 0x22
	  inside_string = false

	  until (the_byte = get_byte).zero?
	    if inside_string
	      # Inside a string literal every byte is copied verbatim.
	      inside_string = false if the_byte == double_quote
	      line_text << the_byte
	      next
	    end

	    case the_byte
	    when double_quote # start of string literal
	      inside_string = true
	      line_text << the_byte
	    when 0xE6 # comes after ' (nothing to emit)
	    when 0x0B # octal number
	      line_text << '&O' + extract_two_byte_integer.to_s(8)
	    when 0x0C # hex number
	      line_text << '&H' + format('%X', extract_two_byte_integer)
	    when 0x0D # line number after RUN
	      terminate('*** ERROR: Got an absolute line number address. This should happen only at runtime, not inside a BASIC file!')
	    when 0x0E, 0x1C # line number before RUN, or integer between 256 and 32767
	      line_text << extract_two_byte_integer.to_s
	    when 0x0F # number from 10 to 255
	      line_text << get_byte.to_s
	    when 0x11..0x1A # number from 0 to 9
	      line_text << (the_byte - 0x11).to_s
	    when 0x1D # single precision real
	      line_text << parse_real_number((1..4).collect { get_byte })
	    when 0x1F # double precision real
	      line_text << parse_real_number((1..8).collect { get_byte })
	    when 0x3A # colon or token prefixed with 3A
	      next_byte = peek_byte
	      if @tokens_prefixed_with_3A.key?(next_byte)
	        line_text << @tokens_prefixed_with_3A[next_byte]
	        get_byte
	      else
	        line_text << the_byte
	      end
	    when 0xFF # token prefixed with FF
	      function_code = get_byte
	      function_name = @tokens_prefixed_with_FF[function_code]
	      # Appending nil would raise TypeError; report the bad token instead.
	      terminate(format('*** Unknown token: FF %02X', function_code)) if function_name.nil?
	      line_text << function_name
	    else
	      # Known keyword, or a plain character byte copied as is.
	      line_text << @single_byte_tokens.fetch(the_byte, the_byte)
	    end
	  end

	  line_text
	end

	# Consumes two bytes from the input and combines them into one Integer,
	# the first byte being the low-order one.
	def extract_two_byte_integer
	  low_byte = get_byte
	  high_byte = get_byte
	  low_byte + (high_byte * 256)
	end

	# Splits a byte into its two 4-bit halves.
	#
	# byte - Integer byte value.
	#
	# Returns a two-element Array: [high nibble, low nibble].
	def parse_bcd(byte)
	  high_nibble = byte >> 4
	  low_nibble = byte & 0x0F
	  [high_nibble, low_nibble]
	end

	# Converts a BCD-encoded real number into its decimal text form.
	#
	# number_bytes - Array of byte values. The first byte is the exponent biased
	#                by 64 (a zero byte means the value is zero); each remaining
	#                byte holds two mantissa digits, split with parse_bcd.
	#
	# Returns a String in plain (non-exponential) notation, with no trailing
	# fractional zeros and no dangling decimal point.
	def parse_real_number(number_bytes)
	  # TODO: use exponential notation for very big or very small exponents

	  exponent = number_bytes[0] - 64
	  return '0' if exponent == -64

	  digits = number_bytes.drop(1).flat_map { |mantissa_byte| parse_bcd(mantissa_byte) }.map(&:to_s)

	  if exponent <= 0
	    # Value below 1: the mantissa follows "0." and |exponent| padding zeros.
	    integer_part = '0'
	    fraction = ('0' * exponent.abs) + digits.join
	  else
	    # The first `exponent` digits are the integer part; zeros are appended
	    # when the mantissa holds fewer digits than that.
	    integer_digits = digits.take(exponent)
	    integer_part = integer_digits.join + ('0' * (exponent - integer_digits.length))
	    fraction = digits.drop(exponent).join
	  end

	  # Only emit the decimal point when significant fractional digits remain,
	  # so a whole number never ends in ".".
	  fraction = fraction.sub(/0+\z/, '')
	  fraction.empty? ? integer_part : "#{integer_part}.#{fraction}"
	end

	# Keywords and operators stored as a single token byte, ordered by byte value.
	# 0xA1 and 0xE6 are absent on purpose: 0xA1 is looked up in the 3A-prefixed
	# table and 0xE6 is handled separately by the line parser.
	@single_byte_tokens = {
	  0x81 => 'END', 0x82 => 'FOR', 0x83 => 'NEXT', 0x84 => 'DATA',
	  0x85 => 'INPUT', 0x86 => 'DIM', 0x87 => 'READ', 0x88 => 'LET',
	  0x89 => 'GOTO', 0x8A => 'RUN', 0x8B => 'IF', 0x8C => 'RESTORE',
	  0x8D => 'GOSUB', 0x8E => 'RETURN',

	  # Careful! MSX2 Technical Handbook says that REM has bytes 3A 8F,
	  # but actually REM has just 8F and ' has 3A 8F E6
	  0x8F => 'REM',

	  0x90 => 'STOP', 0x91 => 'PRINT', 0x92 => 'CLEAR', 0x93 => 'LIST',
	  0x94 => 'NEW', 0x95 => 'ON', 0x96 => 'WAIT', 0x97 => 'DEF',
	  0x98 => 'POKE', 0x99 => 'CONT', 0x9A => 'CSAVE', 0x9B => 'CLOAD',
	  0x9C => 'OUT', 0x9D => 'LPRINT', 0x9E => 'LLIST', 0x9F => 'CLS',
	  0xA0 => 'WIDTH', 0xA2 => 'TRON', 0xA3 => 'TROFF', 0xA4 => 'SWAP',
	  0xA5 => 'ERASE', 0xA6 => 'ERROR', 0xA7 => 'RESUME', 0xA8 => 'DELETE',
	  0xA9 => 'AUTO', 0xAA => 'RENUM', 0xAB => 'DEFSTR', 0xAC => 'DEFINT',
	  0xAD => 'DEFSNG', 0xAE => 'DEFDBL', 0xAF => 'LINE',
	  0xB0 => 'OPEN', 0xB1 => 'FIELD', 0xB2 => 'GET', 0xB3 => 'PUT',
	  0xB4 => 'CLOSE', 0xB5 => 'LOAD', 0xB6 => 'MERGE', 0xB7 => 'FILES',
	  0xB8 => 'LSET', 0xB9 => 'RSET', 0xBA => 'SAVE', 0xBB => 'LFILES',
	  0xBC => 'CIRCLE', 0xBD => 'COLOR', 0xBE => 'DRAW', 0xBF => 'PAINT',
	  0xC0 => 'BEEP', 0xC1 => 'PLAY', 0xC2 => 'PSET', 0xC3 => 'PRESET',
	  0xC4 => 'SOUND', 0xC5 => 'SCREEN', 0xC6 => 'VPOKE', 0xC7 => 'SPRITE',
	  0xC8 => 'VDP', 0xC9 => 'BASE', 0xCA => 'CALL', 0xCB => 'TIME',
	  0xCC => 'KEY', 0xCD => 'MAX', 0xCE => 'MOTOR', 0xCF => 'BLOAD',
	  0xD0 => 'BSAVE', 0xD1 => 'DSKO$', 0xD2 => 'SET', 0xD3 => 'NAME',
	  0xD4 => 'KILL', 0xD5 => 'IPL', 0xD6 => 'COPY', 0xD7 => 'CMD',
	  0xD8 => 'LOCATE', 0xD9 => 'TO', 0xDA => 'THEN', 0xDB => 'TAB(',
	  0xDC => 'STEP', 0xDD => 'USR', 0xDE => 'FN', 0xDF => 'SPC(',
	  0xE0 => 'NOT', 0xE1 => 'ERL', 0xE2 => 'ERR', 0xE3 => 'STRING$',
	  0xE4 => 'USING', 0xE5 => 'INSTR', 0xE7 => 'VARPTR', 0xE8 => 'CSRLIN',
	  0xE9 => 'ATTR$', 0xEA => 'DSKI$', 0xEB => 'OFF', 0xEC => 'INKEY$',
	  0xED => 'POINT', 0xEE => '>', 0xEF => '=',
	  0xF0 => '<', 0xF1 => '+', 0xF2 => '-', 0xF3 => '*',
	  0xF4 => '/', 0xF5 => '^', 0xF6 => 'AND', 0xF7 => 'OR',
	  0xF8 => 'XOR', 0xF9 => 'EQV', 0xFA => 'IMP', 0xFB => 'MOD',
	  0xFC => '\\'
	}

	# Tokens stored as a 0x3A byte followed by the key byte given here.
	@tokens_prefixed_with_3A = { 0x8F => "'", 0xA1 => 'ELSE' }

	# Keywords stored as a 0xFF byte followed by the key byte given here,
	# ordered by byte value.
	@tokens_prefixed_with_FF = {
	  0x81 => 'LEFT$', 0x82 => 'RIGHT$', 0x83 => 'MID$', 0x84 => 'SGN',
	  0x85 => 'INT', 0x86 => 'ABS', 0x87 => 'SQR', 0x88 => 'RND',
	  0x89 => 'SIN', 0x8A => 'LOG', 0x8B => 'EXP', 0x8C => 'COS',
	  0x8D => 'TAN', 0x8E => 'ATN', 0x8F => 'FRE', 0x90 => 'INP',
	  0x91 => 'POS', 0x92 => 'LEN', 0x93 => 'STR$', 0x94 => 'VAL',
	  0x95 => 'ASC', 0x96 => 'CHR$', 0x97 => 'PEEK', 0x98 => 'VPEEK',
	  0x99 => 'SPACE$', 0x9A => 'OCT$', 0x9B => 'HEX$', 0x9C => 'LPOS',
	  0x9D => 'BIN$', 0x9E => 'CINT', 0x9F => 'CSNG', 0xA0 => 'CDBL',
	  0xA1 => 'FIX', 0xA2 => 'STICK', 0xA3 => 'STRIG', 0xA4 => 'PDL',
	  0xA5 => 'PAD', 0xA6 => 'DSKF', 0xA7 => 'FPOS', 0xA8 => 'CVI',
	  0xA9 => 'CVS', 0xAA => 'CVD', 0xAB => 'EOF', 0xAC => 'LOC',
	  0xAD => 'LOF', 0xAE => 'MKI$', 0xAF => 'MKS$', 0xB0 => 'MKD$'
	}

	# Run the parser; every method and token table it uses is defined above.
	main