Added fast tokenizer

This commit is contained in:
2025-03-19 21:59:27 +01:00
parent 9a8bf8ff64
commit eeeff379e2
10 changed files with 172 additions and 105 deletions

View File

@ -1,8 +1,6 @@
package scanner
import (
"os"
"git.urbach.dev/cli/q/src/errors"
"git.urbach.dev/cli/q/src/fs"
"git.urbach.dev/cli/q/src/token"
@ -10,17 +8,19 @@ import (
// scanFile scans a single file.
func (s *Scanner) scanFile(path string, pkg string) error {
contents, err := os.ReadFile(path)
reader := token.Reader{}
err := reader.Open(path)
if err != nil {
return err
}
tokens := token.Tokenize(contents)
defer reader.File.Close()
tokens := token.Tokenize(&reader)
file := &fs.File{
Path: path,
Bytes: contents,
Bytes: reader.Buffer,
Tokens: tokens,
Package: pkg,
}

71
src/token/Reader.go Normal file
View File

@ -0,0 +1,71 @@
package token
import (
"io"
"os"
)
// Reader loads a file into Buffer piece by piece and tracks a read
// position inside the loaded bytes. Call Open before using it.
type Reader struct {
// File is the handle set by Open; read pulls bytes from it.
File *os.File
// Buffer holds the bytes loaded so far. Open creates it empty with a
// capacity of Size+1.
Buffer []byte
// Size is the file size reported by Stat at the time Open was called.
Size Position
// Position is the index into Buffer of the current byte.
Position Position
}
// Advance moves the reader one byte forward. When the new position is not
// yet covered by the loaded bytes, it asks read for more data; the result
// of that read is not reported to the caller.
func (t *Reader) Advance() {
	t.Position++

	// Fast path: the byte at the new position is already in the buffer.
	if t.Position < Position(len(t.Buffer)) {
		return
	}

	t.read()
}
// Current returns the byte at the reader's current position.
// It never loads more data, so the position must already lie inside the
// loaded bytes; an out-of-range position panics.
func (t *Reader) Current() byte {
	data, index := t.Buffer, t.Position
	return data[index]
}
// Next returns the byte directly after the current position without
// moving the reader. If that byte has not been loaded yet, one more read
// from the file is attempted first.
//
// Next returns 0 when no following byte is available (end of file, or the
// read did not deliver it). Callers that peek at the very last byte of the
// input therefore get 0 instead of an index-out-of-range panic.
func (t *Reader) Next() byte {
	next := t.Position + 1

	if next >= Position(len(t.Buffer)) {
		// The error cannot be returned through this signature. A read that
		// fails or hits EOF leaves the buffer too short, which the bounds
		// check below turns into a 0 result.
		_ = t.read()
	}

	if next >= Position(len(t.Buffer)) {
		return 0
	}

	return t.Buffer[next]
}
// read performs a single Read call on the file, storing the new bytes in
// the unused capacity of Buffer and extending Buffer's length to include
// them. io.EOF is treated as success and yields nil; any other error is
// returned as is. If the buffer is completely full after a successful
// read, its capacity is grown so that the next call has room to read into.
func (t *Reader) read() error {
	used := len(t.Buffer)
	n, err := t.File.Read(t.Buffer[used:cap(t.Buffer)])
	t.Buffer = t.Buffer[:used+n]

	switch {
	case err == io.EOF:
		return nil
	case err != nil:
		return err
	}

	if len(t.Buffer) < cap(t.Buffer) {
		return nil
	}

	// Let append pick a larger capacity, then cut the length back so the
	// placeholder byte is not treated as file content.
	length := len(t.Buffer)
	t.Buffer = append(t.Buffer[:cap(t.Buffer)], 0)[:length]
	return nil
}
// Open opens the file at path for reading, records its size as reported by
// Stat in Size, and allocates an empty Buffer with capacity Size+1.
// No file content is read yet.
//
// On success the caller owns t.File and must close it. On failure Open
// returns the error and leaves no open file behind: if Stat fails, the
// handle that was just opened is closed and t.File is reset to nil.
func (t *Reader) Open(path string) (err error) {
	t.File, err = os.Open(path)

	if err != nil {
		return err
	}

	info, err := t.File.Stat()

	if err != nil {
		// Callers only close the file after a successful Open, so the
		// handle has to be released here to avoid leaking it. The Stat
		// error is the one worth reporting, so Close's result is dropped.
		_ = t.File.Close()
		t.File = nil
		return err
	}

	t.Size = Position(info.Size())

	// One spare byte of capacity keeps the buffer from being completely
	// full once Size bytes have been loaded.
	t.Buffer = make([]byte, 0, t.Size+1)
	return nil
}

View File

@ -1,64 +1,65 @@
package token
// Tokenize turns the file contents into a list of tokens.
// Single-character tokens are appended directly in the loop; tokens that
// can span several bytes are delegated to the helpers dash, slash, quote,
// zero, identifier, digit and operator. Bytes that match none of the cases
// become Invalid tokens of length 1. The returned list always ends with an
// EOF token of length 0.
func Tokenize(buffer []byte) List {
func Tokenize(reader *Reader) List {
var (
i Position
tokens = make(List, 0, 8+len(buffer)/2)
tokens = make(List, 0, 8+reader.Size/2)
)
for i < Position(len(buffer)) {
switch buffer[i] {
// Load the first piece of the file before the loop inspects any byte.
reader.read()
for reader.Position < reader.Size {
switch reader.Current() {
// Spaces and tabs produce no token.
case ' ', '\t':
case ',':
tokens = append(tokens, Token{Kind: Separator, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: Separator, Position: reader.Position, Length: 1})
case '(':
tokens = append(tokens, Token{Kind: GroupStart, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: GroupStart, Position: reader.Position, Length: 1})
case ')':
tokens = append(tokens, Token{Kind: GroupEnd, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: GroupEnd, Position: reader.Position, Length: 1})
case '{':
tokens = append(tokens, Token{Kind: BlockStart, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: BlockStart, Position: reader.Position, Length: 1})
case '}':
tokens = append(tokens, Token{Kind: BlockEnd, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: BlockEnd, Position: reader.Position, Length: 1})
case '[':
tokens = append(tokens, Token{Kind: ArrayStart, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: ArrayStart, Position: reader.Position, Length: 1})
case ']':
tokens = append(tokens, Token{Kind: ArrayEnd, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: ArrayEnd, Position: reader.Position, Length: 1})
case '\n':
tokens = append(tokens, Token{Kind: NewLine, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: NewLine, Position: reader.Position, Length: 1})
case '-':
// dash is not followed by continue: the shared advance at the bottom of
// the loop still runs after it.
tokens, i = dash(tokens, buffer, i)
tokens = dash(tokens, reader)
case '/':
tokens, i = slash(tokens, buffer, i)
tokens = slash(tokens, reader)
// The helpers followed by continue move the position themselves, so the
// shared advance at the bottom of the loop is skipped for them.
continue
case '"', '\'':
tokens, i = quote(tokens, buffer, i)
tokens = quote(tokens, reader)
continue
case '0':
tokens, i = zero(tokens, buffer, i)
tokens = zero(tokens, reader)
continue
default:
if isIdentifierStart(buffer[i]) {
tokens, i = identifier(tokens, buffer, i)
if isIdentifierStart(reader.Current()) {
tokens = identifier(tokens, reader)
continue
}
if isDigit(buffer[i]) {
tokens, i = digit(tokens, buffer, i)
if isDigit(reader.Current()) {
tokens = digit(tokens, reader)
continue
}
if isOperator(buffer[i]) {
tokens, i = operator(tokens, buffer, i)
if isOperator(reader.Current()) {
tokens = operator(tokens, reader)
continue
}
tokens = append(tokens, Token{Kind: Invalid, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: Invalid, Position: reader.Position, Length: 1})
}
i++
reader.Advance()
}
tokens = append(tokens, Token{Kind: EOF, Position: i, Length: 0})
tokens = append(tokens, Token{Kind: EOF, Position: reader.Position, Length: 0})
return tokens
}

View File

@ -1,25 +1,25 @@
package token
// dash handles all tokens starting with '-'.
// It appends Negate when there is no previous token or the previous token
// is an operator, an expression start or a keyword. Otherwise it looks at
// the following byte and appends SubAssign for "-=", ReturnType for "->",
// or Sub for anything else (including a '-' at the very end of the input).
// For the two-byte tokens it advances by one byte itself.
func dash(tokens List, buffer []byte, i Position) (List, Position) {
func dash(tokens List, reader *Reader) List {
if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() {
tokens = append(tokens, Token{Kind: Negate, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: Negate, Position: reader.Position, Length: 1})
} else {
// Only peek at the next byte when one exists.
if i+1 < Position(len(buffer)) {
switch buffer[i+1] {
if reader.Position+1 < reader.Size {
switch reader.Next() {
case '=':
tokens = append(tokens, Token{Kind: SubAssign, Position: i, Length: 2})
i++
tokens = append(tokens, Token{Kind: SubAssign, Position: reader.Position, Length: 2})
reader.Advance()
case '>':
tokens = append(tokens, Token{Kind: ReturnType, Position: i, Length: 2})
i++
tokens = append(tokens, Token{Kind: ReturnType, Position: reader.Position, Length: 2})
reader.Advance()
default:
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
}
} else {
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
}
}
return tokens, i
return tokens
}

View File

@ -1,24 +1,24 @@
package token
// digit handles all tokens that qualify as a digit.
// It consumes the run of digit bytes that begins at the current position.
// If the last token in the list is Negate, that token is turned into the
// Number token and its length becomes the digit count plus one; otherwise
// a new Number token covering the digits is appended.
func digit(tokens List, buffer []byte, i Position) (List, Position) {
position := i
i++
func digit(tokens List, reader *Reader) List {
position := reader.Position
reader.Advance()
for i < Position(len(buffer)) && isDigit(buffer[i]) {
i++
for reader.Position < reader.Size && isDigit(reader.Current()) {
reader.Advance()
}
last := len(tokens) - 1
if len(tokens) > 0 && tokens[last].Kind == Negate {
// NOTE(review): the "+ 1" presumably accounts for the '-' byte and so
// assumes the Negate token sits directly before the digits — confirm
// for input such as "- 5", where whitespace separates them.
tokens[last].Kind = Number
tokens[last].Length = Length(i-position) + 1
tokens[last].Length = Length(reader.Position-position) + 1
} else {
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
}
return tokens, i
return tokens
}
func isDigit(c byte) bool {

View File

@ -1,15 +1,15 @@
package token
// identifier handles all tokens that qualify as an identifier.
func identifier(tokens List, buffer []byte, i Position) (List, Position) {
position := i
i++
func identifier(tokens List, reader *Reader) List {
position := reader.Position
reader.Advance()
for i < Position(len(buffer)) && isIdentifier(buffer[i]) {
i++
for reader.Position < reader.Size && isIdentifier(reader.Current()) {
reader.Advance()
}
identifier := buffer[position:i]
identifier := reader.Buffer[position:reader.Position]
kind := Identifier
switch string(identifier) {
@ -37,8 +37,7 @@ func identifier(tokens List, buffer []byte, i Position) (List, Position) {
kind = Switch
}
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
return tokens, i
return append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
}
func isIdentifier(c byte) bool {

View File

@ -1,17 +1,17 @@
package token
// operator handles all tokens that qualify as an operator.
func operator(tokens List, buffer []byte, i Position) (List, Position) {
position := i
i++
func operator(tokens List, reader *Reader) List {
position := reader.Position
reader.Advance()
for i < Position(len(buffer)) && isOperator(buffer[i]) {
i++
for reader.Position < reader.Size && isOperator(reader.Current()) {
reader.Advance()
}
kind := Invalid
switch string(buffer[position:i]) {
switch string(reader.Buffer[position:reader.Position]) {
case "!":
kind = Not
case "!=":
@ -72,8 +72,7 @@ func operator(tokens List, buffer []byte, i Position) (List, Position) {
kind = LogicalOr
}
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
return tokens, i
return append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
}
func isOperator(c byte) bool {

View File

@ -1,20 +1,20 @@
package token
// quote handles all tokens starting with a single or double quote.
func quote(tokens List, buffer []byte, i Position) (List, Position) {
limiter := buffer[i]
start := i
end := Position(len(buffer))
i++
func quote(tokens List, reader *Reader) List {
limiter := reader.Current()
start := reader.Position
end := reader.Size
reader.Advance()
for i < Position(len(buffer)) {
if buffer[i] == limiter && (buffer[i-1] != '\\' || buffer[i-2] == '\\') {
end = i + 1
i++
for reader.Position < reader.Size {
if reader.Current() == limiter && (reader.Buffer[reader.Position-1] != '\\' || reader.Buffer[reader.Position-2] == '\\') {
end = reader.Position + 1
reader.Advance()
break
}
i++
reader.Advance()
}
kind := String
@ -23,6 +23,5 @@ func quote(tokens List, buffer []byte, i Position) (List, Position) {
kind = Rune
}
tokens = append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
return tokens, i
return append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
}

View File

@ -1,34 +1,34 @@
package token
// slash handles all tokens starting with '/'.
// When the next byte is also '/', everything up to (but not including) the
// next newline or the end of the input becomes one Comment token.
// Otherwise the run of operator bytes starting at '/' is collected and
// mapped to Div for "/", DivAssign for "/=", or Invalid for anything else.
func slash(tokens List, buffer []byte, i Position) (List, Position) {
if i+1 < Position(len(buffer)) && buffer[i+1] == '/' {
position := i
func slash(tokens List, reader *Reader) List {
// NOTE(review): the reader-based condition has no bounds check before
// peeking, unlike the buffer-based one — verify the behavior when '/' is
// the last byte of the input.
if reader.Next() == '/' {
position := reader.Position
for i < Position(len(buffer)) && buffer[i] != '\n' {
i++
for reader.Position < reader.Size && reader.Current() != '\n' {
reader.Advance()
}
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(i - position)})
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(reader.Position - position)})
} else {
position := i
i++
position := reader.Position
reader.Advance()
for i < Position(len(buffer)) && isOperator(buffer[i]) {
i++
for reader.Position < reader.Size && isOperator(reader.Current()) {
reader.Advance()
}
kind := Invalid
switch string(buffer[position:i]) {
switch string(reader.Buffer[position:reader.Position]) {
case "/":
kind = Div
case "/=":
kind = DivAssign
}
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
}
return tokens, i
return tokens
}

View File

@ -1,35 +1,33 @@
package token
// zero handles all tokens starting with a '0'.
// A '0' that is the last byte of the input becomes a Number of length 1.
// Otherwise the byte after the '0' selects which bytes may follow: 'x'
// switches the filter to isHexDigit, 'b' to isBinaryDigit and 'o' to
// isOctalDigit; any other byte keeps the isDigit filter. All bytes accepted
// by the filter are consumed and one Number token covering the whole span
// is appended.
func zero(tokens List, buffer []byte, i Position) (List, Position) {
position := i
i++
func zero(tokens List, reader *Reader) List {
position := reader.Position
reader.Advance()
if i >= Position(len(buffer)) {
tokens = append(tokens, Token{Kind: Number, Position: position, Length: 1})
return tokens, i
if reader.Position >= reader.Size {
return append(tokens, Token{Kind: Number, Position: position, Length: 1})
}
filter := isDigit
switch buffer[i] {
switch reader.Current() {
// The prefix letter is consumed here and counts towards the token length.
case 'x':
i++
reader.Advance()
filter = isHexDigit
case 'b':
i++
reader.Advance()
filter = isBinaryDigit
case 'o':
i++
reader.Advance()
filter = isOctalDigit
}
for i < Position(len(buffer)) && filter(buffer[i]) {
i++
for reader.Position < reader.Size && filter(reader.Current()) {
reader.Advance()
}
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
return tokens, i
return append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
}