#!/usr/bin/env python
### decode_ms_basic.py -- Translate a Microsoft Quick Basic file to text.
# 2008.05.04 Steve Witham
# Web page: http://www.tiac.net/~sw/2008/05/Decode_MS_BASIC
# Version 0.5 2008.05.09
#
# This program is free for you to use, copy and modify.
# It is offered as is, with no warranties.
#
# Based on information from "Structure of a Microsoft Basic Program"
# http://www.mactech.com/articles/mactech/Vol.01/01.08/Structure/index.html
#
# Variable names up to 40 characters:
# http://support.microsoft.com/kb/36737
MAX_SYMBOL_LEN = 40

# Floating Point Formats Used by Microsoft BASIC Products:
# http://support.microsoft.com/kb/45166
#
# LANGUAGE VERSION AND MACHINE DEPENDENCIES:
# This program was developed to work with examples from the binary floating
# point version "Quick Basic (b)" for the Mac (68k/PPC).  Some of my
# examples may have come from a PC circa 1985.  There seem to be at least
# two different file formats (distinguished by the first byte of the file).
#
# Also, this Python program was developed on a PPC.  So, no guarantees about
# IEEE vs. older MS binary vs. MS decimal formats, big vs. little endian
# machines, etc.  Not to mention other file format differences used by
# older or newer Basics (or even more formats used by the same Basic!?)
#
# IEEE floats are decoded explicitly as big-endian (matching the big-endian
# integer decoding used everywhere else in this program), so the result no
# longer depends on the byte order of the machine running the script.
# The code runs on both Python 2.7 and Python 3.

from sys import stdin, stdout, stderr, argv, exit
from struct import pack, unpack

# Module-level state.  decode_stream() sets the two mode flags from the first
# byte of the file; cook_tokens() fills TOKENS from TOKENS_RAW.
DECIMAL_MODE = False
MBF_MODE = False
TOKENS = {}


def _to_text(raw):
    """Return raw program bytes as the native str type, ready for writing.

    On Python 2 the bytes are passed through untouched.  On Python 3 each
    byte is mapped to the code point of the same value (Latin-1), which is
    lossless but means non-ASCII bytes are re-encoded by the output stream.
    """
    raw = bytes(bytearray(raw))
    if str is bytes:    # Python 2: str already is a byte string
        return raw
    return raw.decode("latin-1")


# For whenever I find out how Microsoft Binary Format floats show up.
# This code is based on Bengt Richter's "bytes2float"
# at http://tinyurl.com/4djsvh
#
# NOTE(review): most of the original bodies of the two MBF converters was
# lost from the file.  What survived: the special-cased zero pattern, the sign
# taken from the top bit of s[-2], and the last byte being replaced by a zero
# pad byte before a little-endian unpack.  The rest is reconstructed assuming
# the usual MBF layout (last byte = exponent biased by 128, implicit leading
# mantissa bit stored where the sign bit lives) -- confirm against real data.
# MBF_MODE is never switched on, so this code is not reached at present.
def _mbf2float(s, expected_len, fmt):
    """Shared MBF decoder: s is the raw number, fmt the unpack format for the
    mantissa bytes padded out to expected_len bytes."""
    raw = bytearray(s)
    if len(raw) != expected_len:
        raise ValueError("MBF number needs %d bytes, got %d"
                         % (expected_len, len(raw)))
    exponent = raw[-1]
    if exponent == 0:
        # Covers the original "\x00\x00\x00\x00\x00\x00\x7F\x00" special case.
        return 0.0
    negative = raw[-2] >= 0x80
    mantissa_bits = 8 * (expected_len - 1)
    acc = unpack(fmt, bytes(raw[:-1] + b"\x00"))[0]
    acc |= 1 << (mantissa_bits - 1)     # implicit leading 1 replaces the sign bit
    value = acc * 2.0 ** (exponent - 128 - mantissa_bits)
    if negative:
        return -value
    return value


def MBF_double2float(s):
    """Convert an 8-byte Microsoft Binary Format double to a Python float."""
    return _mbf2float(s, 8, "<Q")


def MBF_single2float(s):
    """Convert a 4-byte Microsoft Binary Format single to a Python float."""
    return _mbf2float(s, 4, "<L")


def MS_decimal_float2str(s):
    """Render an MS decimal (BCD) float as BASIC source text.

    s[0] holds the sign in its top bit and the decimal exponent (offset 65
    relative to the first digit) in the low seven bits; the remaining bytes
    hold two BCD digits each.  Numbers of 8 or more bytes get a "D" exponent
    marker, shorter ones "E".
    """
    raw = bytearray(s)
    if not raw:
        raise ValueError("Empty decimal float")
    sign = "-" if raw[0] >= 0x80 else ""
    exp = (raw[0] & 0x7F) - 65
    digits = "".join(["%02x" % b for b in raw[1:]])

    # NOTE(review): the start of this function was lost along with the MBF
    # code, so it is unknown how (or whether) zero was special-cased.  An
    # all-zero mantissa is zero whatever the exponent, so print plain "0"
    # rather than something like "0E-65".
    if not digits.strip("0"):
        return "0"

    if -4 <= exp <= -2:
        # Small numbers: write the leading zeros out instead of an exponent.
        hi_digits = ""
        lo_digits = "." + ("0" * (-1 - exp)) + digits
        exp = 0
    elif -1 <= exp <= len(digits) - 1:
        # The decimal point falls within (or just before) the digits.
        hi_digits = digits[:exp + 1]
        lo_digits = "." + digits[exp + 1:]
        exp = 0
    else:
        # Scientific notation: one digit before the point.
        hi_digits = digits[:1]
        lo_digits = "." + digits[1:]

    lo_digits = lo_digits.rstrip("0")   # drop trailing zeros...
    if lo_digits == ".":                # ...and a bare decimal point
        lo_digits = ""

    text = sign + hi_digits + lo_digits
    if exp == 0:
        return text
    if len(raw) >= 8:
        return text + ("D%+03d" % exp)
    return text + ("E%+03d" % exp)


def _float_text(value, fmt):
    """Return the shortest decimal text that packs back to the same bytes.

    A single-precision 3.14 widened to a Python float would otherwise print
    with conversion noise (3.1400001049...), which is not what the BASIC
    source said.
    """
    target = pack(fmt, value)
    for precision in range(1, 18):
        rounded = float("%.*g" % (precision, value))
        try:
            if pack(fmt, rounded) == target:
                return repr(rounded)
        except OverflowError:
            # Rounding pushed the candidate out of the format's range.
            pass
    return repr(value)


def _process_float(line, width, fmt, mbf_converter, out):
    """Consume a width-byte float from the front of line and write its text."""
    if out is None:
        out = stdout
    numbytes, line = chomp(line, width)
    if DECIMAL_MODE:
        out.write(MS_decimal_float2str(numbytes))
    else:
        if MBF_MODE:
            n = mbf_converter(numbytes)
        else:
            n = unpack(fmt, bytes(bytearray(numbytes)))[0]
        out.write(_float_text(n, fmt))
    return line


def process_single_float(line, out=None):
    """Write the 4-byte float at the start of line; return the rest of line.

    The text goes to out (default: stdout).  The number is read as MS
    decimal, MBF or big-endian IEEE according to DECIMAL_MODE / MBF_MODE.
    """
    return _process_float(line, 4, ">f", MBF_single2float, out)


def process_double_float(line, out=None):
    """Write the 8-byte float at the start of line; return the rest of line.

    The text goes to out (default: stdout).  The number is read as MS
    decimal, MBF or big-endian IEEE according to DECIMAL_MODE / MBF_MODE.
    """
    return _process_float(line, 8, ">d", MBF_double2float, out)


def cook_tokens():
    """Build the TOKENS lookup table from TOKENS_RAW.

    One-byte tokens map byte -> keyword.  Two-byte tokens map the prefix byte
    to a nested dict of second byte -> keyword.  Raises ValueError if an entry
    is malformed or clashes with an earlier one.
    """
    global TOKENS
    cooked = {}
    for token_info in TOKENS_RAW:
        if len(token_info) == 2:
            token, x = token_info
            target = cooked
        elif len(token_info) == 3:
            token, y, x = token_info
            target = cooked.setdefault(y, {})
            if not isinstance(target, dict):
                raise ValueError(
                    "needs to be a dict: %s, but TOKENS[%r] = %r"
                    % (token_info, y, target))
        else:
            raise ValueError("wrong number of parts: %s" % (token_info,))
        if x in target:
            raise ValueError("duplicate entry: %s, already have: %r"
                             % (token_info, target[x]))
        target[x] = token
    TOKENS = cooked


def ord_big_endian(s):
    """Return the unsigned big-endian integer encoded by the bytes of s."""
    n = 0
    for b in bytearray(s):
        n = n * 256 + b
    return n


def chomp(s, n):
    """Split s into (first n items, the rest).

    Raises ValueError if s is shorter than n.
    """
    if len(s) < n:
        raise ValueError("Truncated data: needed %d byte(s), only %d left"
                         % (n, len(s)))
    return s[:n], s[n:]


def process_bytecode(byte, line):
    """Translate the keyword token starting with byte.

    byte is the first token byte (an int or a one-byte string); line holds the
    bytes that follow it.  Multi-byte tokens consume their extra bytes from
    line.  Returns (keyword, remaining line).  Raises ValueError for a byte
    sequence that is not in the token table.
    """
    if not TOKENS:
        cook_tokens()
    if not isinstance(byte, int):
        byte = bytearray(byte)[0]
    if not isinstance(line, bytearray):
        line = bytearray(line)

    bytecode = []
    table = TOKENS
    while True:
        bytecode.append(byte)
        if byte not in table:
            raise ValueError(
                "Unknown bytecode: %r  Hex: %s  Octal: %s"
                % (bytecode,
                   ["0x%02x" % c for c in bytecode],
                   ["%03o" % c for c in bytecode]))
        lookedup = table[byte]
        if isinstance(lookedup, str):
            return lookedup, line
        # Prefix byte: the next byte selects the keyword from a sub-table.
        table = lookedup
        nextbyte, line = chomp(line, 1)
        byte = nextbyte[0]


def _read_exact(f, n, what):
    """Read exactly n bytes from f as a bytearray, or raise ValueError."""
    data = bytearray(f.read(n))
    if len(data) != n:
        raise ValueError("Unexpected end of file while reading " + what)
    return data


def _check_symbol_len(sym_len):
    """Raise ValueError unless sym_len is a plausible symbol name length."""
    if not 1 <= sym_len <= MAX_SYMBOL_LEN:
        raise ValueError("Bad symbol length: %d" % sym_len)


def _read_lines(f):
    """Read all program lines from f, up to and including the terminator.

    Each returned line is a bytearray that still starts with its two length
    bytes but has its trailing zero byte removed.
    """
    lines = []
    while True:
        header = _read_exact(f, 2, "a line length")  # end of file should have these
        line_len = ord_big_endian(header) & 0x0FFF
        if line_len < 4:
            # The program ends with 00 00 (length) followed by 00 (indent).
            if header != b"\x00\x00" or f.read(1) != b"\x00":
                raise ValueError("Bad end-of-program marker")
            return lines
        body = _read_exact(f, line_len - 2, "a program line")
        if body[-1] != 0:
            raise ValueError("Program line does not end with a zero byte")
        lines.append(header + body[:-1])


def _read_symbols(f):
    """Read the symbol table that follows the program lines; return the names."""
    symbols = []
    c = f.read(1)
    if c and bytearray(c)[0] > MAX_SYMBOL_LEN:
        # Seen this, don't know what for.  Also seen extra char '\x01'.
        c = f.read(1)
    while c:
        sym_len = bytearray(c)[0]
        _check_symbol_len(sym_len)
        if not symbols:
            # At beginning of table, was supposed sym_len actually a stray
            # leading byte and next char actually the sym_len?
            # Note MAX_SYMBOL_LEN < 'A'.
            sym = f.read(1)
            if sym and bytearray(sym)[0] < 0x41:
                sym_len = bytearray(sym)[0]
                _check_symbol_len(sym_len)
                sym = f.read(sym_len)
            elif sym_len > 1:
                sym += f.read(sym_len - 1)
        else:
            sym = f.read(sym_len)
        if len(sym) != sym_len:
            raise ValueError("Unexpected end of file in the symbol table")
        symbols.append(_to_text(sym))
        c = f.read(1)
    return symbols


def _lookup_symbol(symbols, symno):
    """Return symbol number symno, or raise ValueError if there is none."""
    if symno >= len(symbols):
        raise ValueError("Symbol number %d out of range (%d symbols)"
                         % (symno, len(symbols)))
    return symbols[symno]


# Byte codes followed by a fixed-width big-endian integer (assumed unsigned).
_INT_WIDTHS = {
    0x0C: 2,    # Two-byte int (??)
    0x0F: 1,    # number in next byte
    0x1C: 2,    # two-byte int
    0x1E: 4,    # Four-byte int
}


def _decode_line(line, symbols, goto_target_size, out):
    """Write the text of one tokenized program line, plus a newline, to out."""
    header, line = chomp(line, 2)
    indent, line = chomp(line, 1)
    if ord_big_endian(header) >= 0x8000:    # top bit set: line has a number
        number, line = chomp(line, 2)
        out.write(str(ord_big_endian(number)) + " ")
    if indent[0]:
        # Hmm, are text line labels supposed to be indented?
        out.write(" " * indent[0])

    in_comment = False
    while line:
        code = line[0]
        line = line[1:]
        if code in (0x01, 0x02, 0x03):
            # 0x01: symbol-table variable reference, 0x02: symbol-table line
            # label (two bytes each); 0x03: GOTO target from the symbol
            # table, but more bytes.
            width = goto_target_size if code == 0x03 else 2
            raw, line = chomp(line, width)
            out.write(_lookup_symbol(symbols, ord_big_endian(raw)))
        elif code == 0x08:
            # THEN or ELSE at end of line-- +location?  Nothing to print.
            raw, line = chomp(line, goto_target_size)
        elif code == 0x0E:
            # line number: three- or four-byte int
            raw, line = chomp(line, goto_target_size)
            out.write(str(ord_big_endian(raw)))
        elif code in _INT_WIDTHS:
            raw, line = chomp(line, _INT_WIDTHS[code])
            out.write(str(ord_big_endian(raw)))
        elif 0x11 <= code <= 0x1A:
            # 0..9 in this byte
            out.write(str(code - 0x11))
        elif code == 0x1B:
            # Prefix for a line label.  No output, just some printable
            # characters next.
            raw, line = chomp(line, goto_target_size)
            if ord_big_endian(raw) != 0:
                raise ValueError("Non-zero bytes after line label prefix")
        elif code == 0x1D:
            # IEEE or decimal float, 4 bytes
            line = process_single_float(line, out)
        elif code == 0x1F:
            # IEEE or decimal double, 8 bytes
            line = process_double_float(line, out)
        elif code < 0x20:
            # Something unknown: show it as an escape sequence.
            out.write(repr(chr(code))[1:-1])
        elif code < 0x80 or in_comment:
            # Printable character
            if code == 0x3A and line.startswith(b"\xAF\xE8"):   # ":REM'"
                # BASIC internally codes ' as :REM' -- undo that:
                line = line[2:]
                code = 0x27
                in_comment = True
            out.write(_to_text(bytearray([code])))
        else:
            token, line = process_bytecode(code, line)
            out.write(token)
            if token == "REM":
                in_comment = True
            elif token == "WHILE" and line and TOKENS.get(line[0]) == "+":
                # Skip + after WHILE:
                line = line[1:]
    out.write("\n")


def decode_stream(f, out=None):
    """Decode the tokenized BASIC program read from f and write it as text.

    f is a binary stream positioned at the first byte of the file; out is a
    text stream (default: stdout).  Sets DECIMAL_MODE and MBF_MODE from the
    first byte.  Raises ValueError if the input is not in the expected format.
    """
    global DECIMAL_MODE, MBF_MODE
    if out is None:
        out = stdout
    cook_tokens()

    firstbyte = _read_exact(f, 1, "the first byte")[0]  # BASIC / format version?
    if firstbyte & 0xF5 != 0xF1:
        out.write("REM First byte: 0x%02x " % firstbyte)
        firstbyte = (firstbyte | 0xF1) & 0xFB
        out.write("--substituting 0x%02x\n" % firstbyte)
    goto_target_size = 3 if firstbyte & 0x08 else 4
    DECIMAL_MODE = bool(firstbyte & 0x02)
    MBF_MODE = False    # Don't know which bit (?) indicates MBF.

    # Read the whole program into memory; after the 00 00 00 that ends the
    # last line, look for the symbol table.  Then decode.
    lines = _read_lines(f)
    symbols = _read_symbols(f)
    for line in lines:
        _decode_line(line, symbols, goto_target_size, out)


def decode_ms_basic_main():    # called at the bottom of the file
    """Command-line entry point: decode argv[1], or stdin, to stdout."""
    if len(argv) > 2 or (len(argv) > 1 and argv[1].startswith("-")):
        stderr.write("Usage: " + argv[0] + " [file_name]\n")
        exit(1)
    try:
        if len(argv) > 1:
            # Binary mode: the file is tokenized, not text.
            with open(argv[1], "rb") as f:
                decode_stream(f, stdout)
        else:
            # Python 3 exposes the raw bytes of stdin as stdin.buffer.
            decode_stream(getattr(stdin, "buffer", stdin), stdout)
    except ValueError as err:
        stderr.write("\n" + argv[0] + ": " + str(err) + "\n")
        exit(1)


# Each entry is [keyword, byte] or [keyword, prefix byte, second byte].
TOKENS_RAW = [
    ['=', 234], ["'", 232], ['+', 236], ['-', 237], ['*', 238], ['/', 239],
    ['^', 240], ['>', 233], ['<', 235], ['\\', 247],
    ['AND', 241], ['ABS', 128], ['ALL', 249, 255], ['APPEND', 249, 254],
    ['ASC', 129], ['AS', 249, 253], ['ATN', 130], ['AUTO', 248, 128],
    ['ABOUT', 249, 243],
    ['BASE', 249, 252], ['BEEP', 248, 181], ['BREAK', 249, 245],
    ['BUTTON', 248, 193], ['BACKPAT', 251, 255],
    ['CALL', 131], ['CDBL', 132], ['CHAIN', 248, 129], ['CHR$', 133],
    ['CINT', 134], ['CLEAR', 248, 130], ['CLOSE', 135], ['CLS', 248, 131],
    ['COMMON', 136], ['CONT', 248, 132], ['COS', 137], ['CSNG', 248, 133],
    ['CVD', 138], ['CVI', 139], ['CVS', 140], ['CIRCLE', 248, 182],
    ['CSRLIN', 248, 198], ['CVSBCD', 248, 205], ['CVDBCD', 248, 206],
    ['CLNG', 193], ['CVL', 194], ['COLOR', 248, 217], ['CHDIR', 248, 221],
    ['CASE', 248, 224],
    ['DATA', 141], ['DATE$', 248, 134], ['DEFINT', 248, 135],
    ['DEFSNG', 248, 136], ['DEFDBL', 248, 137], ['DEFSTR', 248, 138],
    ['DEF', 248, 139], ['DELETE', 248, 140], ['DIM', 248, 141],
    ['DIALOG', 248, 196], ['DEFLNG', 248, 214],
    ['EQV', 244], ['EDIT', 248, 142], ['ELSE', 142], ['END', 248, 143],
    ['EOF', 143], ['ERASE', 248, 144], ['ERL', 248, 145],
    ['ERROR', 248, 146], ['ERR', 248, 147], ['EXP', 144],
    ['EXIT', 248, 191], ['ELSEIF', 192], ['ERASERECT', 251, 233],
    ['ERASEOVAL', 251, 228], ['ERASEROUNDRECT', 251, 223],
    ['ERASEARC', 251, 218], ['ERASEPOLY', 251, 212],
    ['FIELD', 145], ['FILES', 248, 148], ['FIX', 146], ['FN', 147],
    ['FOR', 148], ['FRE', 248, 149], ['FRAMERECT', 251, 235],
    ['FILLRECT', 251, 231], ['FRAMEOVAL', 251, 230],
    ['FILLOVAL', 251, 226], ['FRAMEROUNDRECT', 251, 225],
    ['FILLROUNDRECT', 251, 221], ['FRAMEARC', 251, 220],
    ['FILLARC', 251, 216], ['FRAMEPOLY', 251, 214], ['FILLPOLY', 251, 210],
    ['GET', 149], ['GOSUB', 150], ['GOTO', 151], ['GETPEN', 251, 247],
    ['HEX$', 248, 150], ['HIDECURSOR', 251, 252], ['HIDEPEN', 251, 249],
    ['IMP', 245], ['IF', 152], ['INKEY$', 153], ['INPUT', 154],
    ['INSTR', 248, 151], ['INT', 155], ['IS', 249, 242],
    ['INITCURSOR', 251, 254], ['INVERTRECT', 251, 232],
    ['INVERTOVAL', 251, 227], ['INVERTROUNDRECT', 251, 222],
    ['INVERTARC', 251, 217], ['INVERTPOLY', 251, 211],
    ['KILL', 248, 152],
    ['LEFT$', 156], ['LEN', 157], ['LET', 158], ['LINE', 159],
    ['LIST', 248, 153], ['LLIST', 248, 154], ['LOAD', 248, 155],
    ['LOC', 161], ['LOF', 162], ['LOG', 163], ['LPOS', 248, 156],
    ['LPRINT', 248, 157], ['LSET', 164], ['LCOPY', 248, 183],
    ['LOCATE', 248, 197], ['LBOUND', 248, 199], ['LIBRARY', 248, 204],
    ['LINETO', 251, 240],
    ['MOD', 246], ['MERGE', 248, 158], ['MID$', 165], ['MKD$', 166],
    ['MKI$', 167], ['MKS$', 168], ['MOUSE', 248, 184], ['MENU', 248, 194],
    ['MKSBCD$', 248, 207], ['MKDBCD$', 248, 208], ['MKL$', 195],
    ['MOVETO', 251, 242], ['MOVE', 251, 241],
    ['NAME', 248, 159], ['NEW', 248, 160], ['NEXT', 169], ['NOT', 231],
    ['OR', 242], ['OCT$', 248, 161], ['ON', 170], ['OPEN', 171],
    ['OPTION', 248, 162], ['OUTPUT', 249, 251], ['OFF', 249, 244],
    ['OBSCURECURSOR', 251, 250],
    ['PEEK', 248, 163], ['POKE', 248, 164], ['POS', 248, 165],
    ['PRINT', 172], ['PUT', 173], ['POINT', 248, 185],
    ['PRESET', 248, 186], ['PSET', 248, 187], ['PALETTE', 248, 219],
    ['PRINTDIALOG', 248, 225], ['PENSIZE', 251, 246],
    ['PENMODE', 251, 245], ['PENPAT', 251, 244], ['PENNORMAL', 251, 243],
    ['PAINTRECT', 251, 234], ['PAINTOVAL', 251, 229],
    ['PAINTROUNDRECT', 251, 224], ['PAINTARC', 251, 219],
    ['PTAB', 251, 215], ['PAINTPOLY', 251, 213], ['PICTURE', 250, 128],
    ['POKEW', 250, 130], ['POKEL', 250, 131], ['PEEKW', 250, 132],
    ['PEEKL', 250, 133],
    ['RANDOMIZE', 248, 166], ['READ', 174], ['REM', 175],
    ['RENUM', 248, 167], ['RESTORE', 248, 168], ['RESUME', 248, 169],
    ['RETURN', 176], ['RIGHT$', 177], ['RND', 178], ['RSET', 179],
    ['RUN', 248, 170], ['RESET', 248, 188],
    ['SAVE', 248, 171], ['SGN', 180], ['SHELL', 248, 172], ['SIN', 181],
    ['SPACE$', 182], ['SPC', 249, 250], ['SQR', 183], ['STEP', 249, 249],
    ['STOP', 248, 173], ['STR$', 184], ['STRING$', 185],
    ['SWAP', 248, 174], ['SYSTEM', 248, 175], ['SUB', 248, 190],
    ['STATIC', 227], ['SOUND', 248, 192], ['SHARED', 248, 201],
    ['SCROLL', 248, 203], ['SADD', 248, 215], ['SCROLLBAR', 248, 226],
    ['SELECT', 248, 227], ['SETCURSOR', 251, 253],
    ['SHOWCURSOR', 251, 251], ['SHOWPEN', 251, 248],
    ['TAB', 249, 248], ['TAN', 186], ['THEN', 230], ['TIME$', 248, 176],
    ['TO', 229], ['TRON', 248, 177], ['TROFF', 248, 178],
    ['TIMER', 248, 189], ['TEXTFONT', 251, 239], ['TEXTFACE', 251, 238],
    ['TEXTMODE', 251, 237], ['TEXTSIZE', 251, 236],
    ['TESETTEXT', 251, 209], ['TESETSELECT', 251, 208],
    ['TESCROLL', 251, 207], ['TEKEY', 251, 206], ['TEDELETE', 251, 205],
    ['TEINSERT', 251, 204], ['TEACTIVATE', 251, 203],
    ['TEDEACTIVATE', 251, 202], ['TEUPDATE', 251, 201],
    ['TECALTEXT', 251, 200],
    ['USING', 228], ['USR', 249, 247], ['UBOUND', 248, 200],
    ['UCASE$', 248, 202],
    ['VAL', 188], ['VARPTR', 248, 179],
    ['WAIT', 249, 246], ['WEND', 189], ['WHILE', 190], ['WIDTH', 248, 180],
    ['WRITE', 191], ['WINDOW', 248, 195], ['WAVE', 250, 129],
    ['XOR', 243],
    ["", 248, 209],  # Seems to appear between CALL-less subr name and parenthesized arg.
]

if __name__ == "__main__":
    decode_ms_basic_main()