use ascii;
use bufio;
use encoding::utf8;
use fmt;
use io;
use sort;
use strconv;
use strings;
use strio;
use types;

// State for a lexer reading runes from an input stream.
export type lexer = struct {
	// Stream the runes are scanned from.
	in: *io::stream,
	// Path reported in token locations; borrowed from the caller.
	path: str,
	// Current (line, column) position; starts at (1, 1).
	loc: (uint, uint),
	// Token pushed back with [[unlex]], if any.
	un: (token | void),
	// Push-back buffer filled by unget and drained by next; holds up to two
	// entries, the most recently pushed one in slot 0.
	rb: [2](rune | io::EOF | void),
	// Flags combined from the arguments to [[init]].
	flags: flags,
	// Text of the comments lexed so far; see [[comment]].
	comment: str,
};

// Flags which apply to this lexer
export type flags = enum uint {
	// Enables lexing comments
	COMMENTS = 1 << 0,
};

// A syntax error
export type syntax = (location, str)!;

// All possible lexer errors
export type error = (io::error | syntax)!;

// Returns a human-friendly string for a given error. Syntax errors are
// formatted into a statically allocated buffer, so the returned string is
// overwritten by the next call which formats a syntax error.
export fn strerror(err: error) const str = {
	static let buf: [2048]u8 = [0...];
	return match (err) {
		err: io::error => io::strerror(err),
		s: syntax => fmt::bsprintf(buf, "{}:{},{}: Syntax error: {}",
			s.0.path, s.0.line, s.0.col, s.1),
	};
};

// Initializes a new lexer for the given input stream. The path is borrowed.
// All of the flags given are OR'd together.
export fn init(in: *io::stream, path: str, flags: flags...) lexer = {
	let f: flags = 0: flags;
	for (let i = 0z; i < len(flags); i += 1) {
		f |= flags[i];
	};
	return lexer {
		in = in,
		path = path,
		loc = (1, 1),
		un = void,
		rb = [void...],
		flags = f,
		comment = "",
	};
};

// Returns the current value of the comment buffer, or empty string if unset (or
// if [[flags::COMMENTS]] was not enabled for this lexer).
export fn comment(lex: *lexer) str = lex.comment;

// Returns the next token from the lexer. A token previously passed to [[unlex]]
// is returned first. At the end of the input an ltok::EOF token is returned.
export fn lex(lex: *lexer) (token | error) = {
	match (lex.un) {
		tok: token => {
			lex.un = void;
			return tok;
		},
		_: void => void,
	};

	let loc = location { ... };
	let r: rune = match (nextw(lex)?) {
		_: io::EOF => return (ltok::EOF, void, mkloc(lex)),
		r: (rune, location) => {
			loc = r.1;
			r.0;
		},
	};

	// Names and numeric literals are lexed by helpers which expect to read
	// their first rune themselves, so it is pushed back first.
	if (is_name(r, false)) {
		unget(lex, r);
		return lex_name(lex, loc, true);
	};
	if (ascii::isdigit(r)) {
		unget(lex, r);
		return lex_literal(lex, loc);
	};

	let tok: ltok = switch (r) {
		* => return syntaxerr(loc, "invalid character"),
		'"', '\'' => {
			unget(lex, r);
			return lex_rn_str(lex, loc);
		},
		'.', '<', '>' => return lex3(lex, loc, r),
		'^', '*', '%', '/', '+', '-', ':', '!', '&', '|', '=' => {
			return lex2(lex, loc, r);
		},
		'~' => ltok::BNOT,
		',' => ltok::COMMA,
		'{' => ltok::LBRACE,
		'[' => ltok::LBRACKET,
		'(' => ltok::LPAREN,
		'}' => ltok::RBRACE,
		']' => ltok::RBRACKET,
		')' => ltok::RPAREN,
		';' => ltok::SEMICOLON,
		'?' => ltok::QUESTION,
	};
	return (tok, void, loc);
};

// Returns true if the rune may appear in a name: an ASCII letter, '_' or '@',
// and additionally an ASCII digit when num is true.
fn is_name(r: rune, num: bool) bool =
	ascii::isalpha(r) || r == '_' || r == '@' || (num && ascii::isdigit(r));

// Comparison callback handed to sort::search by lex_name; both arguments point
// to strings. Aborts if ascii::strcmp cannot compare them.
fn ncmp(a: const *void, b: const *void) int = {
	let a = a: const *str, b = b: const *str;
	return match (ascii::strcmp(*a, *b)) {
		_: void => abort("non-ascii name"), // TODO: Bubble me up
		i: int => i,
	};
};

// Reads exactly n hexadecimal digits (n must be less than 9) and returns the
// rune whose code point they spell. Returns a syntax error if the input ends or
// a non-hexadecimal rune is found first.
fn lex_unicode(lex: *lexer, loc: location, n: size) (rune | error) = {
	assert(n < 9);
	let buf: [9]u8 = [0...];
	for (let i = 0z; i < n; i += 1z) {
		let r = match (next(lex)?) {
			_: io::EOF => return syntaxerr(loc,
				"unexpected EOF scanning for escape"),
			r: rune => r,
		};
		if (!ascii::isxdigit(r)) {
			return syntaxerr(loc,
				"unexpected rune scanning for escape");
		};
		// Hexadecimal digits are ASCII, so each fits in a single byte.
		buf[i] = r: u32: u8;
	};
	let s = strings::fromutf8_unsafe(buf[..n]);
	return strconv::stou32b(s, strconv::base::HEX) as u32: rune;
};

// Reads one rune of a rune or string literal, decoding a backslash escape
// sequence if one is present. Returns a syntax error if the input ends early or
// the escape sequence is not recognized.
fn lex_rune(lex: *lexer, loc: location) (rune | error) = {
	let r = match (next(lex)?) {
		_: io::EOF => return syntaxerr(loc,
			"unexpected EOF scanning for rune"),
		r: rune => r,
	};
	if (r != '\\') {
		return r;
	};
	r = match (next(lex)?) {
		_: io::EOF => return syntaxerr(loc,
			"unexpected EOF scanning for escape"),
		r: rune => r,
	};
	return switch (r) {
		'\\' => '\\',
		'\'' => '\'',
		'0' => '\0',
		'a' => '\a',
		'b' => '\b',
		'f' => '\f',
		'n' => '\n',
		'r' => '\r',
		't' => '\t',
		'v' => '\v',
		'"' => '\"',
		'x' => lex_unicode(lex, loc, 2),
		'u' => lex_unicode(lex, loc, 4),
		'U' => lex_unicode(lex, loc, 8),
		// Report escapes which come from the input and are not listed
		// above as syntax errors, rather than leaving them unmatched.
		* => syntaxerr(loc, "unknown escape sequence"),
	};
};

// Lexes the remainder of a string literal whose opening quote has already been
// consumed. If another string literal follows (optionally separated by
// whitespace), it is lexed as well and appended to this one. The token's value
// is a newly allocated string.
fn lex_string(lex: *lexer, loc: location) (token | error) = {
	let buf = strio::dynamic();
	for (true) match (next(lex)?) {
		_: io::EOF => return syntaxerr(loc,
			"unexpected EOF scanning string literal"),
		r: rune => if (r == '"') break else {
			unget(lex, r);
			r = lex_rune(lex, loc)?;
			strio::appendrune(buf, r);
		},
	};
	match (nextw(lex)?) {
		_: io::EOF => void,
		r: (rune, location) => {
			const r = r.0;
			if (r == '"') {
				// Adjacent literal: lex it and append its value.
				const tok = lex_string(lex, loc)?;
				const next = tok.1 as str;
				strio::concat(buf, next);
				free(next);
			} else {
				unget(lex, r);
			};
		},
	};
	return (ltok::LIT_STR, strio::finish(buf), loc);
};

// Lexes a rune or string literal. The next rune must be the opening quote,
// which [[lex]] pushes back before calling this function; anything else aborts.
fn lex_rn_str(lex: *lexer, loc: location) (token | error) = {
	let r = match (next(lex)) {
		r: rune => r,
		_: (io::EOF | io::error) => abort(),
	};
	switch (r) {
		'\"' => return lex_string(lex, loc),
		'\'' => void,
		* => abort(), // Invariant
	};

	// Rune literal
	let ret: token = (ltok::LIT_RUNE, lex_rune(lex, loc)?, loc);
	match (next(lex)?) {
		_: io::EOF =>
			return syntaxerr(loc, "unexpected EOF"),
		n: rune => if (n != '\'')
			return syntaxerr(loc, "expected \"\'\""),
	};
	return ret;
};

// Lexes a name. The next rune must be a valid first rune of a name; anything
// else aborts. If keyword is true the name is searched for in bmap (declared
// elsewhere; presumably the sorted table of keyword strings) and, when found,
// the matching keyword token is returned instead of ltok::NAME. The value of an
// ltok::NAME token is a newly allocated string.
fn lex_name(lex: *lexer, loc: location, keyword: bool) (token | error) = {
	let buf = strio::dynamic();
	match (next(lex)) {
		r: rune => {
			assert(is_name(r, false));
			strio::appendrune(buf, r);
		},
		_: (io::EOF | io::error) => abort(),
	};

	for (true) match (next(lex)?) {
		_: io::EOF => break,
		r: rune => {
			if (!is_name(r, true)) {
				unget(lex, r);
				break;
			};
			strio::appendrune(buf, r);
		},
	};

	let n = strio::finish(buf);
	if (!keyword) {
		return (ltok::NAME, n, loc);
	};
	return match (sort::search(bmap[..ltok::LAST_KEYWORD+1],
			size(str), &n, &ncmp)) {
		null => (ltok::NAME, n, loc),
		v: *void => {
			// The token carries no string, so the name is freed.
			defer free(n);
			// The entry's index within bmap is the token's value.
			let tok = v: uintptr - &bmap[0]: uintptr;
			tok /= size(str): uintptr;
			(tok: ltok, void, loc);
		},
	};
};

// Consumes the rest of a line comment (the leading "//" has been read already)
// and returns the token which follows it. If [[flags::COMMENTS]] is set, the
// comment text, including its trailing newline, is appended to the lexer's
// comment buffer; otherwise it is discarded.
fn lex_comment(lexr: *lexer, loc: location) (token | error) = {
	if (lexr.flags & flags::COMMENTS != flags::COMMENTS) {
		for (true) match (next(lexr)?) {
			_: io::EOF => break,
			r: rune => if (r == '\n') break,
		};
		return lex(lexr);
	};

	let buf = strio::dynamic();
	defer io::close(buf);
	for (true) match (next(lexr)?) {
		_: io::EOF => break,
		r: rune => {
			strio::appendrune(buf, r);
			if (r == '\n') break;
		},
	};
	let new = strings::concat(lexr.comment, strio::string(buf));
	free(lexr.comment);
	lexr.comment = new;
	return lex(lexr);
};

// Lexes a numeric literal: an optional '-', an optional base prefix (0b, 0o or
// 0x), digits, an optional exponent and an optional type suffix. Integer values
// are multiplied by ten once per unit of the exponent. Floating point literals
// are not implemented yet and abort.
fn lex_literal(lex: *lexer, loc: location) (token | error) = {
	// Bytes of the literal as read so far. The strings derived from this
	// slice below are all consumed before returning, and the token only
	// carries a number, so the slice is released on every exit path.
	let chars: []u8 = [];
	defer free(chars);
	let r = match (next(lex)?) {
		_: io::EOF => return (ltok::EOF, void, loc),
		r: rune => r,
	};
	if (r == '-') {
		append(chars, utf8::encoderune(r)...);
		r = match (next(lex)?) {
			_: io::EOF => return (ltok::EOF, void, loc),
			r: rune => r,
		};
	};

	let base = 10u;
	if (r == '0') {
		append(chars, utf8::encoderune(r)...);
		r = match (next(lex)?) {
			_: io::EOF => return (ltok::LIT_ICONST, 0i64, loc),
			r: rune => r,
		};
		switch (r) {
			'b' => base = 2,
			'o' => base = 8,
			'x' => base = 16,
			* => unget(lex, r),
		};
	} else unget(lex, r);
	let basechrs = switch (base) {
		2 => "01",
		8 => "01234567",
		10 => "0123456789",
		16 => "0123456789ABCDEFabcdef",
	};

	// suff and exp are the offsets into chars at which the suffix and the
	// exponent digits begin; end is the offset at which the digits of the
	// value itself end (0 until it is known).
	let suff: (size | void) = void;
	let exp: (size | void) = void;
	let end = 0z;
	let float = false;
	for (true) {
		r = match (next(lex)?) {
			_: io::EOF => break,
			r: rune => r,
		};
		if (!strings::contains(basechrs, r)) switch (r) {
			'.' => if (float || exp is size || suff is size
					|| base != 10) {
				unget(lex, r);
				break;
			} else {
				r = match (next(lex)?) {
					_: io::EOF => {
						// The '.' is not part of this
						// literal; push it back so that
						// it is lexed as the next token
						// instead of being dropped.
						unget(lex, '.');
						break;
					},
					r: rune => r,
				};
				if (!strings::contains(basechrs, r)) {
					// Not a fraction: restore both runes.
					unget(lex, r);
					unget(lex, '.');
					break;
				};
				unget(lex, r);
				float = true;
				append(chars, utf8::encoderune('.')...);
			},
			'e' => if (exp is size || suff is size || base != 10) {
				unget(lex, r);
				break;
			} else {
				if (end == 0) end = len(chars);
				append(chars, utf8::encoderune(r)...);
				exp = len(chars);
			},
			'i', 'u', 'f', 'z' => if (suff is size) {
				unget(lex, r);
				break;
			} else {
				suff = len(chars);
				if (end == 0) end = len(chars);
				append(chars, utf8::encoderune(r)...);
				// The width in a suffix is always decimal.
				basechrs = "0123456789";
			},
			* => {
				unget(lex, r);
				break;
			},
		} else append(chars, utf8::encoderune(r)...);
	};
	if (end == 0) end = len(chars);

	let exp = match (exp) {
		_: void => "0",
		exp: size => {
			let end = match (suff) {
				_: void => len(chars),
				suff: size => suff,
			};
			strings::fromutf8(chars[exp..end]);
		},
	};
	let exp = match (strconv::stoz(exp)) {
		exp: size => exp,
		_: strconv::invalid => abort(), // Shouldn't be lexed in
		_: strconv::overflow =>
			return syntaxerr(loc, "overflow in exponent"),
	};

	let suff = match (suff) {
		suff: size => strings::fromutf8(chars[suff..]),
		_: void => "",
	};
	let suff = if (suff == "u8") ltok::LIT_U8
		else if (suff == "u16") ltok::LIT_U16
		else if (suff == "u32") ltok::LIT_U32
		else if (suff == "u64") ltok::LIT_U64
		else if (suff == "u") ltok::LIT_UINT
		else if (suff == "z") ltok::LIT_SIZE
		else if (suff == "i8") ltok::LIT_I8
		else if (suff == "i16") ltok::LIT_I16
		else if (suff == "i32") ltok::LIT_I32
		else if (suff == "i64") ltok::LIT_I64
		else if (suff == "i") ltok::LIT_INT
		else if (suff == "" && !float) ltok::LIT_ICONST
		else if (suff == "f32") ltok::LIT_F32
		else if (suff == "f64") ltok::LIT_F64
		else if (suff == "" && float) ltok::LIT_FCONST
		else return syntaxerr(loc, "invalid literal suffix");

	let val = strings::fromutf8(chars[..end]);
	let val = switch (suff) {
		ltok::LIT_U8, ltok::LIT_U16, ltok::LIT_U32, ltok::LIT_U64,
		ltok::LIT_UINT, ltok::LIT_SIZE => strconv::stou64b(val, base),
		ltok::LIT_ICONST => match (strconv::stoi64b(val, base)) {
			i: i64 => i,
			_: strconv::invalid => abort(),
			// A non-negative constant which does not fit in an i64
			// is retried as a u64 literal.
			_: strconv::overflow => if (chars[0] != '-': u32: u8) {
				suff = ltok::LIT_U64;
				strconv::stou64b(val, base);
			} else strconv::overflow,
		},
		ltok::LIT_I8, ltok::LIT_I16, ltok::LIT_I32, ltok::LIT_I64,
		ltok::LIT_INT => strconv::stoi64b(val, base),
		ltok::LIT_F32, ltok::LIT_F64, ltok::LIT_FCONST => abort(), // TODO
	};
	let val = match (val) {
		val: u64 => {
			for (let i = 0z; i < exp; i += 1) {
				val *= 10;
			};
			val;
		},
		val: i64 => {
			for (let i = 0z; i < exp; i += 1) {
				val *= 10;
			};
			val;
		},
		_: strconv::invalid => abort(), // Shouldn't be lexed in
		_: strconv::overflow =>
			return syntaxerr(loc, "overflow in exponent"),
	};
	return (suff, val, loc);
};

// Lexes an operator of at most two runes, given its first rune r. This also
// dispatches to lex_comment for "//", to lex_literal for '-' followed by a
// digit, and to lex_name for ':' followed by a name (which yields a label).
fn lex2(lexr: *lexer, loc: location, r: rune) (token | error) = {
	let n = next(lexr)?;
	let tok: ltok = switch (r) {
		'^' => match (n) {
			r: rune => switch (r) {
				'^' => return (ltok::LXOR, void, loc),
				'=' => return (ltok::BXOREQ, void, loc),
				* => ltok::BXOR,
			},
			_: io::EOF => ltok::BXOR,
		},
		'*' => match (n) {
			r: rune => switch (r) {
				'=' => return (ltok::TIMESEQ, void, loc),
				* => ltok::TIMES,
			},
			_: io::EOF => ltok::TIMES,
		},
		'/' => match (n) {
			r: rune => switch (r) {
				'=' => return (ltok::DIVEQ, void, loc),
				'/' => return lex_comment(lexr, loc),
				* => ltok::DIV,
			},
			_: io::EOF => ltok::DIV,
		},
		'%' => match (n) {
			r: rune => switch (r) {
				'=' => return (ltok::MODEQ, void, loc),
				* => ltok::MODULO,
			},
			_: io::EOF => ltok::MODULO,
		},
		'+' => match (n) {
			r: rune => switch (r) {
				'=' => return (ltok::PLUSEQ, void, loc),
				* => ltok::PLUS,
			},
			_: io::EOF => ltok::PLUS,
		},
		'-' => match (n) {
			r: rune => switch (r) {
				'=' => return (ltok::MINUSEQ, void, loc),
				* => if (ascii::isdigit(r)) {
					// Negative literal: restore both runes
					// for lex_literal to read.
					unget(lexr, r);
					unget(lexr, '-');
					return lex_literal(lexr, loc);
				} else {
					ltok::MINUS;
				},
			},
			_: io::EOF => ltok::MINUS,
		},
		':' => match (n) {
			r: rune => switch (r) {
				':' => return (ltok::DOUBLE_COLON, void, loc),
				* => if (is_name(r, false)) {
					unget(lexr, r);
					let tok = lex_name(lexr, loc, false)?;
					tok.0 = ltok::LABEL;
					return tok;
				} else ltok::COLON,
			},
			_: io::EOF => ltok::COLON,
		},
		'!' => match (n) {
			r: rune => switch (r) {
				'=' => return (ltok::NEQUAL, void, loc),
				* => ltok::LNOT,
			},
			_: io::EOF => ltok::LNOT,
		},
		'&' => match (n) {
			r: rune => switch (r) {
				'&' => return (ltok::LAND, void, loc),
				'=' => return (ltok::ANDEQ, void, loc),
				* => ltok::BAND,
			},
			_: io::EOF => ltok::BAND,
		},
		'|' => match (n) {
			r: rune => switch (r) {
				'|' => return (ltok::LOR, void, loc),
				'=' => return (ltok::OREQ, void, loc),
				* => ltok::BOR,
			},
			_: io::EOF => ltok::BOR,
		},
		'=' => match (n) {
			r: rune => switch (r) {
				'=' => return (ltok::LEQUAL, void, loc),
				'>' => return (ltok::CASE, void, loc),
				* => ltok::EQUAL,
			},
			_: io::EOF => ltok::EQUAL,
		},
		* => return syntaxerr(loc, "unknown token sequence"),
	};
	// Only single-rune operators reach this point; the rune (or EOF) which
	// followed is not part of the token, so it is pushed back.
	unget(lexr, n);
	return (tok, void, loc);
};

// Lexes an operator of up to three runes which begins with '.', '<' or '>',
// given its first rune r.
fn lex3(lex: *lexer, loc: location, r: rune) (token | error) = {
	let n = match (next(lex)?) {
		_: io::EOF => return switch (r) {
			'.' => (ltok::DOT, void, loc),
			'<' => (ltok::LESS, void, loc),
			'>' => (ltok::GREATER, void, loc),
			* => abort(), // Invariant
		},
		r: rune => r,
	};
	return switch (r) {
		'.' => lex3dot(lex, loc, n),
		'<' => lex3lt(lex, loc, n),
		'>' => lex3gt(lex, loc, n),
		* => syntaxerr(loc, "unknown token sequence"),
	};
};

// Lexes ".", ".." or "...". The first '.' has been consumed and n is the rune
// which followed it.
fn lex3dot(lex: *lexer, loc: location, n: rune) (token | error) = {
	let tok: ltok = switch (n) {
		'.' => {
			let q = match (next(lex)?) {
				_: io::EOF => io::EOF,
				r: rune => r,
			};
			let t = match (q) {
				r: rune => switch (r) {
					'.' => return (ltok::ELLIPSIS, void, loc),
					* => ltok::SLICE,
				},
				_: io::EOF => ltok::SLICE,
			};
			unget(lex, q);
			t;
		},
		* => {
			unget(lex, n);
			ltok::DOT;
		}
	};
	return (tok, void, loc);
};

// Lexes "<", "<=", "<<" or "<<=". The first '<' has been consumed and n is the
// rune which followed it.
fn lex3lt(lex: *lexer, loc: location, n: rune) (token | error) = {
	let tok: ltok = switch (n) {
		'<' => {
			let q = match (next(lex)?) {
				_: io::EOF => io::EOF,
				r: rune => r,
			};
			let t = match (q) {
				r: rune => switch (r) {
					'=' => return (ltok::LSHIFTEQ, void, loc),
					* => ltok::LSHIFT,
				},
				_: io::EOF => ltok::LSHIFT,
			};
			unget(lex, q);
			t;
		},
		'=' => ltok::LESSEQ,
		* => {
			unget(lex, n);
			ltok::LESS;
		}
	};
	return (tok, void, loc);
};

// Lexes ">", ">=", ">>" or ">>=". The first '>' has been consumed and n is the
// rune which followed it.
fn lex3gt(lex: *lexer, loc: location, n: rune) (token | error) = {
	let tok: ltok = switch (n) {
		'>' => {
			let q = match (next(lex)?) {
				_: io::EOF => io::EOF,
				r: rune => r,
			};
			let t = match (q) {
				r: rune => switch (r) {
					'=' => return (ltok::RSHIFTEQ, void, loc),
					* => ltok::RSHIFT,
				},
				_: io::EOF => ltok::RSHIFT,
			};
			unget(lex, q);
			t;
		},
		'=' => ltok::GREATEREQ,
		* => {
			unget(lex, n);
			ltok::GREATER;
		}
	};
	return (tok, void, loc);
};

// Unlex a single token. The next call to [[lex]] will return this token. Only one
// unlex is supported at a time; you must call [[lex]] before calling [[unlex]]
// again.
export fn unlex(lex: *lexer, tok: token) void = {
	assert(lex.un is void, "attempted to unlex more than one token");
	lex.un = tok;
};

// Returns the next rune, taking it from the push-back buffer if that is not
// empty and from the input stream otherwise. The location is only advanced for
// runes read from the stream.
fn next(lex: *lexer) (rune | io::EOF | io::error) = {
	match (lex.rb[0]) {
		_: void => void,
		r: (rune | io::EOF) => {
			lex.rb[0] = lex.rb[1];
			lex.rb[1] = void;
			return r;
		},
	};

	for (true) {
		return match (bufio::scanrune(lex.in)) {
			e: (io::EOF | io::error) => e,
			r: rune => {
				lexloc(lex, r);
				r;
			},
		};
	};

	abort("unreachable");
};

// Skips whitespace and returns the next non-whitespace rune along with the
// location recorded just before it was read. The comment buffer is reset
// whenever a whitespace rune is skipped.
fn nextw(lex: *lexer) ((rune, location) | io::EOF | io::error) = {
	for (true) {
		let loc = mkloc(lex);
		match (next(lex)) {
			e: (io::error | io::EOF) => return e,
			r: rune => if (!ascii::isspace(r)) {
				return (r, loc);
			} else {
				free(lex.comment);
				lex.comment = "";
			},
		};
	};
	abort();
};

// Advances the lexer's location past the rune r: a newline moves to the first
// column of the next line, a tab advances the column by 8 and any other rune
// advances it by 1.
fn lexloc(lex: *lexer, r: rune) void = {
	switch (r) {
		'\n' => {
			lex.loc.0 += 1;
			lex.loc.1 = 1;
		},
		'\t' => lex.loc.1 += 8,
		* => lex.loc.1 += 1,
	};
};

// Pushes a rune (or EOF) back so that the following call to next returns it.
// At most two entries may be pending at once; the most recently pushed entry
// is returned first. The location is not rewound.
fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
	if (!(lex.rb[0] is void)) {
		assert(lex.rb[1] is void, "ungot too many runes");
		lex.rb[1] = lex.rb[0];
	};
	lex.rb[0] = r;
};

// Returns the lexer's current position as a location.
fn mkloc(lex: *lexer) location = location {
	path = lex.path,
	line = lex.loc.0,
	col = lex.loc.1,
};

// Creates a syntax error with the given location and message.
fn syntaxerr(loc: location, why: str) error = (loc, why);