Added fast tokenizer
This commit is contained in:
@ -1,8 +1,6 @@
|
||||
package scanner
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"git.urbach.dev/cli/q/src/errors"
|
||||
"git.urbach.dev/cli/q/src/fs"
|
||||
"git.urbach.dev/cli/q/src/token"
|
||||
@ -10,17 +8,19 @@ import (
|
||||
|
||||
// scanFile scans a single file.
|
||||
func (s *Scanner) scanFile(path string, pkg string) error {
|
||||
contents, err := os.ReadFile(path)
|
||||
reader := token.Reader{}
|
||||
err := reader.Open(path)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tokens := token.Tokenize(contents)
|
||||
defer reader.File.Close()
|
||||
tokens := token.Tokenize(&reader)
|
||||
|
||||
file := &fs.File{
|
||||
Path: path,
|
||||
Bytes: contents,
|
||||
Bytes: reader.Buffer,
|
||||
Tokens: tokens,
|
||||
Package: pkg,
|
||||
}
|
||||
|
71
src/token/Reader.go
Normal file
71
src/token/Reader.go
Normal file
@ -0,0 +1,71 @@
|
||||
package token
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
type Reader struct {
|
||||
File *os.File
|
||||
Buffer []byte
|
||||
Size Position
|
||||
Position Position
|
||||
}
|
||||
|
||||
func (t *Reader) Advance() {
|
||||
t.Position++
|
||||
|
||||
if t.Position >= Position(len(t.Buffer)) {
|
||||
t.read()
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Reader) Current() byte {
|
||||
return t.Buffer[t.Position]
|
||||
}
|
||||
|
||||
func (t *Reader) Next() byte {
|
||||
if t.Position+1 >= Position(len(t.Buffer)) {
|
||||
t.read()
|
||||
}
|
||||
|
||||
return t.Buffer[t.Position+1]
|
||||
}
|
||||
|
||||
func (t *Reader) read() error {
|
||||
n, err := t.File.Read(t.Buffer[len(t.Buffer):cap(t.Buffer)])
|
||||
t.Buffer = t.Buffer[:len(t.Buffer)+n]
|
||||
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
return nil
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
if len(t.Buffer) >= cap(t.Buffer) {
|
||||
d := append(t.Buffer[:cap(t.Buffer)], 0)
|
||||
t.Buffer = d[:len(t.Buffer)]
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *Reader) Open(path string) (err error) {
|
||||
t.File, err = os.Open(path)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
info, err := t.File.Stat()
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
t.Size = Position(info.Size())
|
||||
t.Buffer = make([]byte, 0, t.Size+1)
|
||||
return nil
|
||||
}
|
@ -1,64 +1,65 @@
|
||||
package token
|
||||
|
||||
// Tokenize turns the file contents into a list of tokens.
|
||||
func Tokenize(buffer []byte) List {
|
||||
func Tokenize(reader *Reader) List {
|
||||
var (
|
||||
i Position
|
||||
tokens = make(List, 0, 8+len(buffer)/2)
|
||||
tokens = make(List, 0, 8+reader.Size/2)
|
||||
)
|
||||
|
||||
for i < Position(len(buffer)) {
|
||||
switch buffer[i] {
|
||||
reader.read()
|
||||
|
||||
for reader.Position < reader.Size {
|
||||
switch reader.Current() {
|
||||
case ' ', '\t':
|
||||
case ',':
|
||||
tokens = append(tokens, Token{Kind: Separator, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: Separator, Position: reader.Position, Length: 1})
|
||||
case '(':
|
||||
tokens = append(tokens, Token{Kind: GroupStart, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: GroupStart, Position: reader.Position, Length: 1})
|
||||
case ')':
|
||||
tokens = append(tokens, Token{Kind: GroupEnd, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: GroupEnd, Position: reader.Position, Length: 1})
|
||||
case '{':
|
||||
tokens = append(tokens, Token{Kind: BlockStart, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: BlockStart, Position: reader.Position, Length: 1})
|
||||
case '}':
|
||||
tokens = append(tokens, Token{Kind: BlockEnd, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: BlockEnd, Position: reader.Position, Length: 1})
|
||||
case '[':
|
||||
tokens = append(tokens, Token{Kind: ArrayStart, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: ArrayStart, Position: reader.Position, Length: 1})
|
||||
case ']':
|
||||
tokens = append(tokens, Token{Kind: ArrayEnd, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: ArrayEnd, Position: reader.Position, Length: 1})
|
||||
case '\n':
|
||||
tokens = append(tokens, Token{Kind: NewLine, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: NewLine, Position: reader.Position, Length: 1})
|
||||
case '-':
|
||||
tokens, i = dash(tokens, buffer, i)
|
||||
tokens = dash(tokens, reader)
|
||||
case '/':
|
||||
tokens, i = slash(tokens, buffer, i)
|
||||
tokens = slash(tokens, reader)
|
||||
continue
|
||||
case '"', '\'':
|
||||
tokens, i = quote(tokens, buffer, i)
|
||||
tokens = quote(tokens, reader)
|
||||
continue
|
||||
case '0':
|
||||
tokens, i = zero(tokens, buffer, i)
|
||||
tokens = zero(tokens, reader)
|
||||
continue
|
||||
default:
|
||||
if isIdentifierStart(buffer[i]) {
|
||||
tokens, i = identifier(tokens, buffer, i)
|
||||
if isIdentifierStart(reader.Current()) {
|
||||
tokens = identifier(tokens, reader)
|
||||
continue
|
||||
}
|
||||
|
||||
if isDigit(buffer[i]) {
|
||||
tokens, i = digit(tokens, buffer, i)
|
||||
if isDigit(reader.Current()) {
|
||||
tokens = digit(tokens, reader)
|
||||
continue
|
||||
}
|
||||
|
||||
if isOperator(buffer[i]) {
|
||||
tokens, i = operator(tokens, buffer, i)
|
||||
if isOperator(reader.Current()) {
|
||||
tokens = operator(tokens, reader)
|
||||
continue
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: Invalid, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: Invalid, Position: reader.Position, Length: 1})
|
||||
}
|
||||
|
||||
i++
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: EOF, Position: i, Length: 0})
|
||||
tokens = append(tokens, Token{Kind: EOF, Position: reader.Position, Length: 0})
|
||||
return tokens
|
||||
}
|
||||
|
@ -1,25 +1,25 @@
|
||||
package token
|
||||
|
||||
// dash handles all tokens starting with '-'.
|
||||
func dash(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
func dash(tokens List, reader *Reader) List {
|
||||
if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() {
|
||||
tokens = append(tokens, Token{Kind: Negate, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: Negate, Position: reader.Position, Length: 1})
|
||||
} else {
|
||||
if i+1 < Position(len(buffer)) {
|
||||
switch buffer[i+1] {
|
||||
if reader.Position+1 < reader.Size {
|
||||
switch reader.Next() {
|
||||
case '=':
|
||||
tokens = append(tokens, Token{Kind: SubAssign, Position: i, Length: 2})
|
||||
i++
|
||||
tokens = append(tokens, Token{Kind: SubAssign, Position: reader.Position, Length: 2})
|
||||
reader.Advance()
|
||||
case '>':
|
||||
tokens = append(tokens, Token{Kind: ReturnType, Position: i, Length: 2})
|
||||
i++
|
||||
tokens = append(tokens, Token{Kind: ReturnType, Position: reader.Position, Length: 2})
|
||||
reader.Advance()
|
||||
default:
|
||||
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
|
||||
}
|
||||
} else {
|
||||
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
|
||||
tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
|
||||
}
|
||||
}
|
||||
|
||||
return tokens, i
|
||||
return tokens
|
||||
}
|
||||
|
@ -1,24 +1,24 @@
|
||||
package token
|
||||
|
||||
// digit handles all tokens that qualify as a digit.
|
||||
func digit(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
position := i
|
||||
i++
|
||||
func digit(tokens List, reader *Reader) List {
|
||||
position := reader.Position
|
||||
reader.Advance()
|
||||
|
||||
for i < Position(len(buffer)) && isDigit(buffer[i]) {
|
||||
i++
|
||||
for reader.Position < reader.Size && isDigit(reader.Current()) {
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
last := len(tokens) - 1
|
||||
|
||||
if len(tokens) > 0 && tokens[last].Kind == Negate {
|
||||
tokens[last].Kind = Number
|
||||
tokens[last].Length = Length(i-position) + 1
|
||||
tokens[last].Length = Length(reader.Position-position) + 1
|
||||
} else {
|
||||
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
|
||||
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
|
||||
}
|
||||
|
||||
return tokens, i
|
||||
return tokens
|
||||
}
|
||||
|
||||
func isDigit(c byte) bool {
|
||||
|
@ -1,15 +1,15 @@
|
||||
package token
|
||||
|
||||
// identifier handles all tokens that qualify as an identifier.
|
||||
func identifier(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
position := i
|
||||
i++
|
||||
func identifier(tokens List, reader *Reader) List {
|
||||
position := reader.Position
|
||||
reader.Advance()
|
||||
|
||||
for i < Position(len(buffer)) && isIdentifier(buffer[i]) {
|
||||
i++
|
||||
for reader.Position < reader.Size && isIdentifier(reader.Current()) {
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
identifier := buffer[position:i]
|
||||
identifier := reader.Buffer[position:reader.Position]
|
||||
kind := Identifier
|
||||
|
||||
switch string(identifier) {
|
||||
@ -37,8 +37,7 @@ func identifier(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
kind = Switch
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
|
||||
return tokens, i
|
||||
return append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
|
||||
}
|
||||
|
||||
func isIdentifier(c byte) bool {
|
||||
|
@ -1,17 +1,17 @@
|
||||
package token
|
||||
|
||||
// operator handles all tokens that qualify as an operator.
|
||||
func operator(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
position := i
|
||||
i++
|
||||
func operator(tokens List, reader *Reader) List {
|
||||
position := reader.Position
|
||||
reader.Advance()
|
||||
|
||||
for i < Position(len(buffer)) && isOperator(buffer[i]) {
|
||||
i++
|
||||
for reader.Position < reader.Size && isOperator(reader.Current()) {
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
kind := Invalid
|
||||
|
||||
switch string(buffer[position:i]) {
|
||||
switch string(reader.Buffer[position:reader.Position]) {
|
||||
case "!":
|
||||
kind = Not
|
||||
case "!=":
|
||||
@ -72,8 +72,7 @@ func operator(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
kind = LogicalOr
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
|
||||
return tokens, i
|
||||
return append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
|
||||
}
|
||||
|
||||
func isOperator(c byte) bool {
|
||||
|
@ -1,20 +1,20 @@
|
||||
package token
|
||||
|
||||
// quote handles all tokens starting with a single or double quote.
|
||||
func quote(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
limiter := buffer[i]
|
||||
start := i
|
||||
end := Position(len(buffer))
|
||||
i++
|
||||
func quote(tokens List, reader *Reader) List {
|
||||
limiter := reader.Current()
|
||||
start := reader.Position
|
||||
end := reader.Size
|
||||
reader.Advance()
|
||||
|
||||
for i < Position(len(buffer)) {
|
||||
if buffer[i] == limiter && (buffer[i-1] != '\\' || buffer[i-2] == '\\') {
|
||||
end = i + 1
|
||||
i++
|
||||
for reader.Position < reader.Size {
|
||||
if reader.Current() == limiter && (reader.Buffer[reader.Position-1] != '\\' || reader.Buffer[reader.Position-2] == '\\') {
|
||||
end = reader.Position + 1
|
||||
reader.Advance()
|
||||
break
|
||||
}
|
||||
|
||||
i++
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
kind := String
|
||||
@ -23,6 +23,5 @@ func quote(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
kind = Rune
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
|
||||
return tokens, i
|
||||
return append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
|
||||
}
|
||||
|
@ -1,34 +1,34 @@
|
||||
package token
|
||||
|
||||
// slash handles all tokens starting with '/'.
|
||||
func slash(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
if i+1 < Position(len(buffer)) && buffer[i+1] == '/' {
|
||||
position := i
|
||||
func slash(tokens List, reader *Reader) List {
|
||||
if reader.Next() == '/' {
|
||||
position := reader.Position
|
||||
|
||||
for i < Position(len(buffer)) && buffer[i] != '\n' {
|
||||
i++
|
||||
for reader.Position < reader.Size && reader.Current() != '\n' {
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(i - position)})
|
||||
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(reader.Position - position)})
|
||||
} else {
|
||||
position := i
|
||||
i++
|
||||
position := reader.Position
|
||||
reader.Advance()
|
||||
|
||||
for i < Position(len(buffer)) && isOperator(buffer[i]) {
|
||||
i++
|
||||
for reader.Position < reader.Size && isOperator(reader.Current()) {
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
kind := Invalid
|
||||
|
||||
switch string(buffer[position:i]) {
|
||||
switch string(reader.Buffer[position:reader.Position]) {
|
||||
case "/":
|
||||
kind = Div
|
||||
case "/=":
|
||||
kind = DivAssign
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
|
||||
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
|
||||
}
|
||||
|
||||
return tokens, i
|
||||
return tokens
|
||||
}
|
||||
|
@ -1,35 +1,33 @@
|
||||
package token
|
||||
|
||||
// zero handles all tokens starting with a '0'.
|
||||
func zero(tokens List, buffer []byte, i Position) (List, Position) {
|
||||
position := i
|
||||
i++
|
||||
func zero(tokens List, reader *Reader) List {
|
||||
position := reader.Position
|
||||
reader.Advance()
|
||||
|
||||
if i >= Position(len(buffer)) {
|
||||
tokens = append(tokens, Token{Kind: Number, Position: position, Length: 1})
|
||||
return tokens, i
|
||||
if reader.Position >= reader.Size {
|
||||
return append(tokens, Token{Kind: Number, Position: position, Length: 1})
|
||||
}
|
||||
|
||||
filter := isDigit
|
||||
|
||||
switch buffer[i] {
|
||||
switch reader.Current() {
|
||||
case 'x':
|
||||
i++
|
||||
reader.Advance()
|
||||
filter = isHexDigit
|
||||
|
||||
case 'b':
|
||||
i++
|
||||
reader.Advance()
|
||||
filter = isBinaryDigit
|
||||
|
||||
case 'o':
|
||||
i++
|
||||
reader.Advance()
|
||||
filter = isOctalDigit
|
||||
}
|
||||
|
||||
for i < Position(len(buffer)) && filter(buffer[i]) {
|
||||
i++
|
||||
for reader.Position < reader.Size && filter(reader.Current()) {
|
||||
reader.Advance()
|
||||
}
|
||||
|
||||
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
|
||||
return tokens, i
|
||||
return append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
|
||||
}
|
||||
|
Reference in New Issue
Block a user