Added fast tokenizer

This commit is contained in:
2025-03-19 21:59:27 +01:00
parent 9a8bf8ff64
commit eeeff379e2
10 changed files with 172 additions and 105 deletions

View File

@ -1,8 +1,6 @@
package scanner package scanner
import ( import (
"os"
"git.urbach.dev/cli/q/src/errors" "git.urbach.dev/cli/q/src/errors"
"git.urbach.dev/cli/q/src/fs" "git.urbach.dev/cli/q/src/fs"
"git.urbach.dev/cli/q/src/token" "git.urbach.dev/cli/q/src/token"
@ -10,17 +8,19 @@ import (
// scanFile scans a single file. // scanFile scans a single file.
func (s *Scanner) scanFile(path string, pkg string) error { func (s *Scanner) scanFile(path string, pkg string) error {
contents, err := os.ReadFile(path) reader := token.Reader{}
err := reader.Open(path)
if err != nil { if err != nil {
return err return err
} }
tokens := token.Tokenize(contents) defer reader.File.Close()
tokens := token.Tokenize(&reader)
file := &fs.File{ file := &fs.File{
Path: path, Path: path,
Bytes: contents, Bytes: reader.Buffer,
Tokens: tokens, Tokens: tokens,
Package: pkg, Package: pkg,
} }

71
src/token/Reader.go Normal file
View File

@ -0,0 +1,71 @@
package token
import (
"io"
"os"
)
// Reader incrementally loads a file into memory and tracks the
// current read position for the tokenizer.
type Reader struct {
	File     *os.File // underlying file handle; the caller closes it after tokenizing
	Buffer   []byte   // bytes buffered so far; grows via read() as the cursor advances
	Size     Position // total file size in bytes, recorded by Open from Stat
	Position Position // index into Buffer of the byte the tokenizer is looking at
}
// Advance moves the cursor one byte forward, pulling more of the
// file into the buffer on demand once the cursor catches up with
// the buffered data.
// NOTE(review): the error returned by read is discarded here —
// presumably safe because the tokenizer bounds its loops by Size,
// but worth confirming.
func (t *Reader) Advance() {
	t.Position++

	if Position(len(t.Buffer)) <= t.Position {
		t.read()
	}
}
// Current returns the byte under the cursor. The buffer is expected
// to cover Position already (Advance refills it before the cursor
// moves past the buffered data).
func (t *Reader) Current() byte {
	buffered := t.Buffer
	return buffered[t.Position]
}
// Next returns the byte immediately after the cursor without moving
// the cursor, loading more of the file first if needed.
//
// If the cursor is on the last byte of the file, read() cannot grow
// the buffer any further, so Next reports a NUL sentinel instead of
// panicking with an out-of-range index (the original would panic
// here, e.g. when slash() peeks past a trailing '/').
func (t *Reader) Next() byte {
	if t.Position+1 >= Position(len(t.Buffer)) {
		t.read()
	}

	if t.Position+1 >= Position(len(t.Buffer)) {
		// End of file: no next byte exists.
		return 0
	}

	return t.Buffer[t.Position+1]
}
// read pulls the next chunk of the file into Buffer, extending the
// buffer's length by however many bytes the underlying Read delivers.
func (t *Reader) read() error {
	// Read into the spare capacity between len and cap, then extend
	// the slice over the bytes that were actually filled in.
	n, err := t.File.Read(t.Buffer[len(t.Buffer):cap(t.Buffer)])
	t.Buffer = t.Buffer[:len(t.Buffer)+n]

	if err != nil {
		if err == io.EOF {
			// End of file is the normal termination case, not a failure.
			return nil
		}

		return err
	}

	if len(t.Buffer) >= cap(t.Buffer) {
		// Buffer is completely full: force a reallocation with extra
		// capacity (append past cap) while keeping the current length,
		// so the next read has room to write into.
		// NOTE(review): with cap = Size+1 set in Open, this can only
		// trigger if the file yields more bytes than Stat reported —
		// confirm that is the intended safeguard.
		d := append(t.Buffer[:cap(t.Buffer)], 0)
		t.Buffer = d[:len(t.Buffer)]
	}

	return nil
}
// Open opens the file at path and prepares the reader for tokenizing:
// it records the file size and allocates a buffer with capacity for
// the whole file plus one spare byte (see read for why).
//
// On success the caller owns t.File and must close it when finished.
// On failure no open handle is left behind.
func (t *Reader) Open(path string) (err error) {
	t.File, err = os.Open(path)

	if err != nil {
		return err
	}

	info, err := t.File.Stat()

	if err != nil {
		// Fix: the original leaked the handle here — callers only
		// close File after a successful Open, so Stat failing left
		// the file open forever.
		t.File.Close()
		t.File = nil
		return err
	}

	t.Size = Position(info.Size())
	t.Buffer = make([]byte, 0, t.Size+1)
	return nil
}

View File

@ -1,64 +1,65 @@
package token package token
// Tokenize turns the file contents into a list of tokens. // Tokenize turns the file contents into a list of tokens.
func Tokenize(buffer []byte) List { func Tokenize(reader *Reader) List {
var ( var (
i Position tokens = make(List, 0, 8+reader.Size/2)
tokens = make(List, 0, 8+len(buffer)/2)
) )
for i < Position(len(buffer)) { reader.read()
switch buffer[i] {
for reader.Position < reader.Size {
switch reader.Current() {
case ' ', '\t': case ' ', '\t':
case ',': case ',':
tokens = append(tokens, Token{Kind: Separator, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: Separator, Position: reader.Position, Length: 1})
case '(': case '(':
tokens = append(tokens, Token{Kind: GroupStart, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: GroupStart, Position: reader.Position, Length: 1})
case ')': case ')':
tokens = append(tokens, Token{Kind: GroupEnd, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: GroupEnd, Position: reader.Position, Length: 1})
case '{': case '{':
tokens = append(tokens, Token{Kind: BlockStart, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: BlockStart, Position: reader.Position, Length: 1})
case '}': case '}':
tokens = append(tokens, Token{Kind: BlockEnd, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: BlockEnd, Position: reader.Position, Length: 1})
case '[': case '[':
tokens = append(tokens, Token{Kind: ArrayStart, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: ArrayStart, Position: reader.Position, Length: 1})
case ']': case ']':
tokens = append(tokens, Token{Kind: ArrayEnd, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: ArrayEnd, Position: reader.Position, Length: 1})
case '\n': case '\n':
tokens = append(tokens, Token{Kind: NewLine, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: NewLine, Position: reader.Position, Length: 1})
case '-': case '-':
tokens, i = dash(tokens, buffer, i) tokens = dash(tokens, reader)
case '/': case '/':
tokens, i = slash(tokens, buffer, i) tokens = slash(tokens, reader)
continue continue
case '"', '\'': case '"', '\'':
tokens, i = quote(tokens, buffer, i) tokens = quote(tokens, reader)
continue continue
case '0': case '0':
tokens, i = zero(tokens, buffer, i) tokens = zero(tokens, reader)
continue continue
default: default:
if isIdentifierStart(buffer[i]) { if isIdentifierStart(reader.Current()) {
tokens, i = identifier(tokens, buffer, i) tokens = identifier(tokens, reader)
continue continue
} }
if isDigit(buffer[i]) { if isDigit(reader.Current()) {
tokens, i = digit(tokens, buffer, i) tokens = digit(tokens, reader)
continue continue
} }
if isOperator(buffer[i]) { if isOperator(reader.Current()) {
tokens, i = operator(tokens, buffer, i) tokens = operator(tokens, reader)
continue continue
} }
tokens = append(tokens, Token{Kind: Invalid, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: Invalid, Position: reader.Position, Length: 1})
} }
i++ reader.Advance()
} }
tokens = append(tokens, Token{Kind: EOF, Position: i, Length: 0}) tokens = append(tokens, Token{Kind: EOF, Position: reader.Position, Length: 0})
return tokens return tokens
} }

View File

@ -1,25 +1,25 @@
package token package token
// dash handles all tokens starting with '-'. // dash handles all tokens starting with '-'.
func dash(tokens List, buffer []byte, i Position) (List, Position) { func dash(tokens List, reader *Reader) List {
if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() { if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() {
tokens = append(tokens, Token{Kind: Negate, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: Negate, Position: reader.Position, Length: 1})
} else { } else {
if i+1 < Position(len(buffer)) { if reader.Position+1 < reader.Size {
switch buffer[i+1] { switch reader.Next() {
case '=': case '=':
tokens = append(tokens, Token{Kind: SubAssign, Position: i, Length: 2}) tokens = append(tokens, Token{Kind: SubAssign, Position: reader.Position, Length: 2})
i++ reader.Advance()
case '>': case '>':
tokens = append(tokens, Token{Kind: ReturnType, Position: i, Length: 2}) tokens = append(tokens, Token{Kind: ReturnType, Position: reader.Position, Length: 2})
i++ reader.Advance()
default: default:
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
} }
} else { } else {
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1}) tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
} }
} }
return tokens, i return tokens
} }

View File

@ -1,24 +1,24 @@
package token package token
// digit handles all tokens that qualify as a digit. // digit handles all tokens that qualify as a digit.
func digit(tokens List, buffer []byte, i Position) (List, Position) { func digit(tokens List, reader *Reader) List {
position := i position := reader.Position
i++ reader.Advance()
for i < Position(len(buffer)) && isDigit(buffer[i]) { for reader.Position < reader.Size && isDigit(reader.Current()) {
i++ reader.Advance()
} }
last := len(tokens) - 1 last := len(tokens) - 1
if len(tokens) > 0 && tokens[last].Kind == Negate { if len(tokens) > 0 && tokens[last].Kind == Negate {
tokens[last].Kind = Number tokens[last].Kind = Number
tokens[last].Length = Length(i-position) + 1 tokens[last].Length = Length(reader.Position-position) + 1
} else { } else {
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)}) tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
} }
return tokens, i return tokens
} }
func isDigit(c byte) bool { func isDigit(c byte) bool {

View File

@ -1,15 +1,15 @@
package token package token
// identifier handles all tokens that qualify as an identifier. // identifier handles all tokens that qualify as an identifier.
func identifier(tokens List, buffer []byte, i Position) (List, Position) { func identifier(tokens List, reader *Reader) List {
position := i position := reader.Position
i++ reader.Advance()
for i < Position(len(buffer)) && isIdentifier(buffer[i]) { for reader.Position < reader.Size && isIdentifier(reader.Current()) {
i++ reader.Advance()
} }
identifier := buffer[position:i] identifier := reader.Buffer[position:reader.Position]
kind := Identifier kind := Identifier
switch string(identifier) { switch string(identifier) {
@ -37,8 +37,7 @@ func identifier(tokens List, buffer []byte, i Position) (List, Position) {
kind = Switch kind = Switch
} }
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))}) return append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
return tokens, i
} }
func isIdentifier(c byte) bool { func isIdentifier(c byte) bool {

View File

@ -1,17 +1,17 @@
package token package token
// operator handles all tokens that qualify as an operator. // operator handles all tokens that qualify as an operator.
func operator(tokens List, buffer []byte, i Position) (List, Position) { func operator(tokens List, reader *Reader) List {
position := i position := reader.Position
i++ reader.Advance()
for i < Position(len(buffer)) && isOperator(buffer[i]) { for reader.Position < reader.Size && isOperator(reader.Current()) {
i++ reader.Advance()
} }
kind := Invalid kind := Invalid
switch string(buffer[position:i]) { switch string(reader.Buffer[position:reader.Position]) {
case "!": case "!":
kind = Not kind = Not
case "!=": case "!=":
@ -72,8 +72,7 @@ func operator(tokens List, buffer []byte, i Position) (List, Position) {
kind = LogicalOr kind = LogicalOr
} }
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)}) return append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
return tokens, i
} }
func isOperator(c byte) bool { func isOperator(c byte) bool {

View File

@ -1,20 +1,20 @@
package token package token
// quote handles all tokens starting with a single or double quote. // quote handles all tokens starting with a single or double quote.
func quote(tokens List, buffer []byte, i Position) (List, Position) { func quote(tokens List, reader *Reader) List {
limiter := buffer[i] limiter := reader.Current()
start := i start := reader.Position
end := Position(len(buffer)) end := reader.Size
i++ reader.Advance()
for i < Position(len(buffer)) { for reader.Position < reader.Size {
if buffer[i] == limiter && (buffer[i-1] != '\\' || buffer[i-2] == '\\') { if reader.Current() == limiter && (reader.Buffer[reader.Position-1] != '\\' || reader.Buffer[reader.Position-2] == '\\') {
end = i + 1 end = reader.Position + 1
i++ reader.Advance()
break break
} }
i++ reader.Advance()
} }
kind := String kind := String
@ -23,6 +23,5 @@ func quote(tokens List, buffer []byte, i Position) (List, Position) {
kind = Rune kind = Rune
} }
tokens = append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)}) return append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
return tokens, i
} }

View File

@ -1,34 +1,34 @@
package token package token
// slash handles all tokens starting with '/'. // slash handles all tokens starting with '/'.
func slash(tokens List, buffer []byte, i Position) (List, Position) { func slash(tokens List, reader *Reader) List {
if i+1 < Position(len(buffer)) && buffer[i+1] == '/' { if reader.Next() == '/' {
position := i position := reader.Position
for i < Position(len(buffer)) && buffer[i] != '\n' { for reader.Position < reader.Size && reader.Current() != '\n' {
i++ reader.Advance()
} }
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(i - position)}) tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(reader.Position - position)})
} else { } else {
position := i position := reader.Position
i++ reader.Advance()
for i < Position(len(buffer)) && isOperator(buffer[i]) { for reader.Position < reader.Size && isOperator(reader.Current()) {
i++ reader.Advance()
} }
kind := Invalid kind := Invalid
switch string(buffer[position:i]) { switch string(reader.Buffer[position:reader.Position]) {
case "/": case "/":
kind = Div kind = Div
case "/=": case "/=":
kind = DivAssign kind = DivAssign
} }
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)}) tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
} }
return tokens, i return tokens
} }

View File

@ -1,35 +1,33 @@
package token package token
// zero handles all tokens starting with a '0'. // zero handles all tokens starting with a '0'.
func zero(tokens List, buffer []byte, i Position) (List, Position) { func zero(tokens List, reader *Reader) List {
position := i position := reader.Position
i++ reader.Advance()
if i >= Position(len(buffer)) { if reader.Position >= reader.Size {
tokens = append(tokens, Token{Kind: Number, Position: position, Length: 1}) return append(tokens, Token{Kind: Number, Position: position, Length: 1})
return tokens, i
} }
filter := isDigit filter := isDigit
switch buffer[i] { switch reader.Current() {
case 'x': case 'x':
i++ reader.Advance()
filter = isHexDigit filter = isHexDigit
case 'b': case 'b':
i++ reader.Advance()
filter = isBinaryDigit filter = isBinaryDigit
case 'o': case 'o':
i++ reader.Advance()
filter = isOctalDigit filter = isOctalDigit
} }
for i < Position(len(buffer)) && filter(buffer[i]) { for reader.Position < reader.Size && filter(reader.Current()) {
i++ reader.Advance()
} }
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)}) return append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
return tokens, i
} }