Simplified tokenizer

This commit is contained in:
Eduard Urbach 2025-02-02 11:11:59 +01:00
parent 1be26f288c
commit 858d0f21cf
Signed by: akyoto
GPG Key ID: C874F672B1AF20C0
9 changed files with 300 additions and 261 deletions

View File

@ -1,4 +1,7 @@
package ast
// Node is the common type used for all kinds of AST nodes.
// It is declared as `any`, so it places no constraints on the concrete node types.
type Node any
// AST is an abstract syntax tree, represented as a flat slice of nodes.
type AST []Node

View File

@ -27,246 +27,29 @@ func Tokenize(buffer []byte) List {
case '\n':
tokens = append(tokens, Token{Kind: NewLine, Position: i, Length: 1})
case '-':
if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() {
tokens = append(tokens, Token{Kind: Negate, Position: i, Length: 1})
} else {
if i+1 < Position(len(buffer)) {
switch buffer[i+1] {
case '=':
tokens = append(tokens, Token{Kind: SubAssign, Position: i, Length: 2})
i++
case '>':
tokens = append(tokens, Token{Kind: ReturnType, Position: i, Length: 2})
i++
default:
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
}
} else {
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
}
}
tokens, i = dash(tokens, buffer, i)
case '/':
if i+1 < Position(len(buffer)) && buffer[i+1] == '/' {
position := i
for i < Position(len(buffer)) && buffer[i] != '\n' {
i++
}
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(i - position)})
} else {
position := i
i++
for i < Position(len(buffer)) && isOperator(buffer[i]) {
i++
}
kind := Invalid
switch string(buffer[position:i]) {
case "/":
kind = Div
case "/=":
kind = DivAssign
}
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
}
tokens, i = slash(tokens, buffer, i)
continue
case '"', '\'':
limiter := buffer[i]
start := i
end := Position(len(buffer))
i++
for i < Position(len(buffer)) {
if buffer[i] == limiter && (buffer[i-1] != '\\' || buffer[i-2] == '\\') {
end = i + 1
i++
break
}
i++
}
kind := String
if limiter == '\'' {
kind = Rune
}
tokens = append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
tokens, i = quote(tokens, buffer, i)
continue
case '0':
position := i
i++
if i >= Position(len(buffer)) {
tokens = append(tokens, Token{Kind: Number, Position: position, Length: 1})
break
}
filter := isDigit
switch buffer[i] {
case 'x':
i++
filter = isHexDigit
case 'b':
i++
filter = isBinaryDigit
case 'o':
i++
filter = isOctalDigit
}
for i < Position(len(buffer)) && filter(buffer[i]) {
i++
}
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
tokens, i = zero(tokens, buffer, i)
continue
default:
if isIdentifierStart(buffer[i]) {
position := i
i++
for i < Position(len(buffer)) && isIdentifier(buffer[i]) {
i++
}
identifier := buffer[position:i]
kind := Identifier
switch string(identifier) {
case "assert":
kind = Assert
case "if":
kind = If
case "else":
kind = Else
case "import":
kind = Import
case "loop":
kind = Loop
case "return":
kind = Return
case "switch":
kind = Switch
}
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
tokens, i = identifier(tokens, buffer, i)
continue
}
if isDigit(buffer[i]) {
position := i
i++
for i < Position(len(buffer)) && isDigit(buffer[i]) {
i++
}
last := len(tokens) - 1
if len(tokens) > 0 && tokens[last].Kind == Negate {
tokens[last].Kind = Number
tokens[last].Length = Length(i-position) + 1
} else {
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
}
tokens, i = digit(tokens, buffer, i)
continue
}
if isOperator(buffer[i]) {
position := i
i++
for i < Position(len(buffer)) && isOperator(buffer[i]) {
i++
}
kind := Invalid
switch string(buffer[position:i]) {
case "!":
kind = Not
case "!=":
kind = NotEqual
case "%":
kind = Mod
case "%=":
kind = ModAssign
case "&":
kind = And
case "&&":
kind = LogicalAnd
case "&=":
kind = AndAssign
case "*":
kind = Mul
case "*=":
kind = MulAssign
case "+":
kind = Add
case "+=":
kind = AddAssign
// case "-":
// kind = Sub
// case "-=":
// kind = SubAssign
// case "->":
// kind = ReturnType
case ".":
kind = Period
// case "/":
// kind = Div
// case "/=":
// kind = DivAssign
case ":=":
kind = Define
case "<":
kind = Less
case "<<":
kind = Shl
case "<<=":
kind = ShlAssign
case "<=":
kind = LessEqual
case "=":
kind = Assign
case "==":
kind = Equal
case ">":
kind = Greater
case ">=":
kind = GreaterEqual
case ">>":
kind = Shr
case ">>=":
kind = ShrAssign
case "^":
kind = Xor
case "^=":
kind = XorAssign
case "|":
kind = Or
case "|=":
kind = OrAssign
case "||":
kind = LogicalOr
}
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
tokens, i = operator(tokens, buffer, i)
continue
}
@ -279,40 +62,3 @@ func Tokenize(buffer []byte) List {
tokens = append(tokens, Token{Kind: EOF, Position: i, Length: 0})
return tokens
}
// isIdentifier reports whether c may appear inside an identifier:
// an underscore, or anything accepted by isLetter or isDigit.
func isIdentifier(c byte) bool {
	if c == '_' {
		return true
	}

	return isLetter(c) || isDigit(c)
}
// isIdentifierStart reports whether c may begin an identifier:
// an underscore or anything accepted by isLetter.
func isIdentifierStart(c byte) bool {
	if c == '_' {
		return true
	}

	return isLetter(c)
}
// isLetter reports whether c is an ASCII letter ('a'-'z' or 'A'-'Z').
func isLetter(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	default:
		return false
	}
}
// isDigit reports whether c is a decimal digit ('0'-'9').
func isDigit(c byte) bool {
	return '0' <= c && c <= '9'
}
// isHexDigit reports whether c is a hexadecimal digit.
// Besides '0'-'9', only the uppercase letters 'A'-'F' are accepted.
func isHexDigit(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'A' <= c && c <= 'F':
		return true
	default:
		return false
	}
}
// isBinaryDigit reports whether c is a binary digit ('0' or '1').
func isBinaryDigit(c byte) bool {
	switch c {
	case '0', '1':
		return true
	}

	return false
}
// isOctalDigit reports whether c is an octal digit ('0'-'7').
func isOctalDigit(c byte) bool {
	return '0' <= c && c <= '7'
}
// isOperator reports whether c is one of the bytes that operator tokens are built from.
func isOperator(c byte) bool {
	switch c {
	case '!', '%', '&', '*', '+', '-', '.', '/', ':', '<', '=', '>', '^', '|':
		return true
	}

	return false
}

25
src/token/dash.go Normal file
View File

@ -0,0 +1,25 @@
package token
// dash handles all tokens starting with '-'.
//
// At the start of the token list, or directly after a token for which
// IsOperator, IsExpressionStart or IsKeyword holds, the dash is emitted as a
// one-byte Negate token. Otherwise it becomes SubAssign ("-="), ReturnType
// ("->") or a plain Sub. The returned position is the index of the last byte
// that belongs to the emitted token.
func dash(tokens List, buffer []byte, i Position) (List, Position) {
	unary := len(tokens) == 0

	if !unary {
		previous := &tokens[len(tokens)-1]
		unary = previous.IsOperator() || previous.IsExpressionStart() || previous.IsKeyword()
	}

	if unary {
		return append(tokens, Token{Kind: Negate, Position: i, Length: 1}), i
	}

	kind, length := Sub, Length(1)

	// Peek at the following byte, if any, to detect the two-byte forms.
	if i+1 < Position(len(buffer)) {
		switch buffer[i+1] {
		case '=':
			kind, length = SubAssign, 2
		case '>':
			kind, length = ReturnType, 2
		}
	}

	tokens = append(tokens, Token{Kind: kind, Position: i, Length: length})

	if length == 2 {
		i++
	}

	return tokens, i
}

38
src/token/digit.go Normal file
View File

@ -0,0 +1,38 @@
package token
// digit handles all tokens that qualify as a digit.
//
// It consumes buffer[i] plus the run of decimal digits that follows it and
// records the result as a Number token. When the previous token is a Negate
// located directly in front of the first digit, that token is rewritten in
// place into a Number covering the sign and the digits instead of appending a
// new token. The returned position is the index of the first byte after the
// number.
func digit(tokens List, buffer []byte, i Position) (List, Position) {
	position := i
	i++

	for i < Position(len(buffer)) && isDigit(buffer[i]) {
		i++
	}

	last := len(tokens) - 1

	// Only fold the sign into the number when it is adjacent to the digits.
	// If anything separates them (e.g. "- 5"), a merged token starting at the
	// sign with length digits+1 would not cover the digits, so the Negate is
	// kept as its own token and the number is appended separately.
	if last >= 0 && tokens[last].Kind == Negate && tokens[last].Position+1 == position {
		tokens[last].Kind = Number
		tokens[last].Length = Length(i-position) + 1
		return tokens, i
	}

	return append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)}), i
}
// isDigit reports whether c is a decimal digit ('0'-'9').
func isDigit(c byte) bool {
	return '0' <= c && c <= '9'
}
// isHexDigit reports whether c is a hexadecimal digit.
// Besides '0'-'9', only the uppercase letters 'A'-'F' are accepted.
func isHexDigit(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'A' <= c && c <= 'F':
		return true
	default:
		return false
	}
}
// isBinaryDigit reports whether c is a binary digit ('0' or '1').
func isBinaryDigit(c byte) bool {
	switch c {
	case '0', '1':
		return true
	}

	return false
}
// isOctalDigit reports whether c is an octal digit ('0'-'7').
func isOctalDigit(c byte) bool {
	return '0' <= c && c <= '7'
}

46
src/token/identifier.go Normal file
View File

@ -0,0 +1,46 @@
package token
// identifier handles all tokens that qualify as an identifier.
//
// It consumes buffer[i] plus every following byte accepted by isIdentifier.
// If the resulting word is one of the reserved words listed below, the token
// gets that keyword's kind; otherwise it is a plain Identifier. The returned
// position is the index of the first byte after the word.
func identifier(tokens List, buffer []byte, i Position) (List, Position) {
	start := i

	// The caller has already classified buffer[start], so begin at the next byte.
	for i++; i < Position(len(buffer)) && isIdentifier(buffer[i]); i++ {
	}

	kind := Identifier

	switch string(buffer[start:i]) {
	case "assert":
		kind = Assert
	case "else":
		kind = Else
	case "if":
		kind = If
	case "import":
		kind = Import
	case "loop":
		kind = Loop
	case "return":
		kind = Return
	case "switch":
		kind = Switch
	}

	return append(tokens, Token{Kind: kind, Position: start, Length: Length(i - start)}), i
}
// isIdentifier reports whether c may appear inside an identifier:
// an underscore, or anything accepted by isLetter or isDigit.
func isIdentifier(c byte) bool {
	if c == '_' {
		return true
	}

	return isLetter(c) || isDigit(c)
}
// isIdentifierStart reports whether c may begin an identifier:
// an underscore or anything accepted by isLetter.
func isIdentifierStart(c byte) bool {
	if c == '_' {
		return true
	}

	return isLetter(c)
}
// isLetter reports whether c is an ASCII letter ('a'-'z' or 'A'-'Z').
func isLetter(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	default:
		return false
	}
}

84
src/token/operator.go Normal file
View File

@ -0,0 +1,84 @@
package token
// operator handles all tokens that qualify as an operator.
//
// It consumes buffer[i] plus every following byte accepted by isOperator and
// maps the resulting spelling to a token kind. Spellings that are not listed
// below produce an Invalid token spanning the whole run. The returned
// position is the index of the first byte after the run.
func operator(tokens List, buffer []byte, i Position) (List, Position) {
	start := i

	// The caller has already classified buffer[start], so begin at the next byte.
	for i++; i < Position(len(buffer)) && isOperator(buffer[i]); i++ {
	}

	kind := Invalid

	switch string(buffer[start:i]) {
	// One-byte spellings.
	case "!":
		kind = Not
	case "%":
		kind = Mod
	case "&":
		kind = And
	case "*":
		kind = Mul
	case "+":
		kind = Add
	case ".":
		kind = Period
	case "<":
		kind = Less
	case "=":
		kind = Assign
	case ">":
		kind = Greater
	case "^":
		kind = Xor
	case "|":
		kind = Or

	// Two-byte spellings.
	case "!=":
		kind = NotEqual
	case "%=":
		kind = ModAssign
	case "&&":
		kind = LogicalAnd
	case "&=":
		kind = AndAssign
	case "*=":
		kind = MulAssign
	case "+=":
		kind = AddAssign
	case ":=":
		kind = Define
	case "<<":
		kind = Shl
	case "<=":
		kind = LessEqual
	case "==":
		kind = Equal
	case ">=":
		kind = GreaterEqual
	case ">>":
		kind = Shr
	case "^=":
		kind = XorAssign
	case "|=":
		kind = OrAssign
	case "||":
		kind = LogicalOr

	// Three-byte spellings.
	case "<<=":
		kind = ShlAssign
	case ">>=":
		kind = ShrAssign
	}

	return append(tokens, Token{Kind: kind, Position: start, Length: Length(i - start)}), i
}
// isOperator reports whether c is one of the bytes that operator tokens are built from.
func isOperator(c byte) bool {
	switch c {
	case '!', '%', '&', '*', '+', '-', '.', '/', ':', '<', '=', '>', '^', '|':
		return true
	}

	return false
}

28
src/token/quote.go Normal file
View File

@ -0,0 +1,28 @@
package token
// quote handles all tokens starting with a single or double quote.
//
// buffer[i] is the opening quote. The token extends up to and including the
// next unescaped occurrence of the same quote character, or to the end of the
// buffer when the literal is never closed. Double quotes produce a String
// token, single quotes a Rune token. The returned position is the index of
// the first byte after the token.
func quote(tokens List, buffer []byte, i Position) (List, Position) {
	limiter := buffer[i]
	start := i
	end := Position(len(buffer))
	i++

	for i < Position(len(buffer)) {
		if buffer[i] == limiter {
			// A quote is escaped only when it is preceded by an odd number of
			// backslashes: every pair of backslashes is an escaped backslash
			// and leaves the quote untouched. Looking at just the two
			// preceding bytes misreads runs of three or more backslashes.
			escaped := false

			for j := i - 1; j > start && buffer[j] == '\\'; j-- {
				escaped = !escaped
			}

			if !escaped {
				end = i + 1
				i++
				break
			}
		}

		i++
	}

	kind := String

	if limiter == '\'' {
		kind = Rune
	}

	tokens = append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
	return tokens, i
}

34
src/token/slash.go Normal file
View File

@ -0,0 +1,34 @@
package token
// slash handles all tokens starting with '/'.
//
// Two consecutive slashes start a Comment token that runs up to, but not
// including, the next newline or the end of the buffer. Otherwise the slash
// and every following byte accepted by isOperator are consumed; the spelling
// "/" yields Div, "/=" yields DivAssign and anything else yields Invalid.
// The returned position is the index of the first byte after the token.
func slash(tokens List, buffer []byte, i Position) (List, Position) {
	start := i
	size := Position(len(buffer))

	if i+1 < size && buffer[i+1] == '/' {
		for i < size && buffer[i] != '\n' {
			i++
		}

		return append(tokens, Token{Kind: Comment, Position: start, Length: Length(i - start)}), i
	}

	for i++; i < size && isOperator(buffer[i]); i++ {
	}

	kind := Invalid

	switch string(buffer[start:i]) {
	case "/=":
		kind = DivAssign
	case "/":
		kind = Div
	}

	return append(tokens, Token{Kind: kind, Position: start, Length: Length(i - start)}), i
}

35
src/token/zero.go Normal file
View File

@ -0,0 +1,35 @@
package token
// zero handles all tokens starting with a '0'.
//
// A following 'x', 'b' or 'o' is consumed as a base prefix and selects
// isHexDigit, isBinaryDigit or isOctalDigit as the digit filter; without a
// prefix isDigit is used. All following bytes accepted by the filter are
// consumed and the whole run is recorded as a single Number token. The
// returned position is the index of the first byte after the token.
func zero(tokens List, buffer []byte, i Position) (List, Position) {
	start := i
	size := Position(len(buffer))
	i++

	// A lone '0' at the very end of the buffer has nothing more to scan.
	if i < size {
		accept := isDigit

		switch buffer[i] {
		case 'x':
			accept = isHexDigit
			i++
		case 'b':
			accept = isBinaryDigit
			i++
		case 'o':
			accept = isOctalDigit
			i++
		}

		for i < size && accept(buffer[i]) {
			i++
		}
	}

	return append(tokens, Token{Kind: Number, Position: start, Length: Length(i - start)}), i
}