Added fast tokenizer
This commit is contained in:
@ -1,8 +1,6 @@
|
|||||||
package scanner
|
package scanner
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
|
||||||
|
|
||||||
"git.urbach.dev/cli/q/src/errors"
|
"git.urbach.dev/cli/q/src/errors"
|
||||||
"git.urbach.dev/cli/q/src/fs"
|
"git.urbach.dev/cli/q/src/fs"
|
||||||
"git.urbach.dev/cli/q/src/token"
|
"git.urbach.dev/cli/q/src/token"
|
||||||
@ -10,17 +8,19 @@ import (
|
|||||||
|
|
||||||
// scanFile scans a single file.
|
// scanFile scans a single file.
|
||||||
func (s *Scanner) scanFile(path string, pkg string) error {
|
func (s *Scanner) scanFile(path string, pkg string) error {
|
||||||
contents, err := os.ReadFile(path)
|
reader := token.Reader{}
|
||||||
|
err := reader.Open(path)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens := token.Tokenize(contents)
|
defer reader.File.Close()
|
||||||
|
tokens := token.Tokenize(&reader)
|
||||||
|
|
||||||
file := &fs.File{
|
file := &fs.File{
|
||||||
Path: path,
|
Path: path,
|
||||||
Bytes: contents,
|
Bytes: reader.Buffer,
|
||||||
Tokens: tokens,
|
Tokens: tokens,
|
||||||
Package: pkg,
|
Package: pkg,
|
||||||
}
|
}
|
||||||
|
71
src/token/Reader.go
Normal file
71
src/token/Reader.go
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
package token
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Reader pulls the bytes of a source file into Buffer on demand and
// tracks the offset of the byte currently being looked at.
type Reader struct {
|
||||||
|
// File is the handle assigned by Open; read fetches bytes from it.
File *os.File
|
||||||
|
// Buffer holds the bytes read so far; Open allocates it empty with capacity Size+1.
Buffer []byte
|
||||||
|
// Size is the file size reported by Stat at the time Open was called.
Size Position
|
||||||
|
// Position is the index into Buffer used by Current; Advance increments it.
Position Position
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Reader) Advance() {
|
||||||
|
t.Position++
|
||||||
|
|
||||||
|
if t.Position >= Position(len(t.Buffer)) {
|
||||||
|
t.read()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Reader) Current() byte {
|
||||||
|
return t.Buffer[t.Position]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Reader) Next() byte {
|
||||||
|
if t.Position+1 >= Position(len(t.Buffer)) {
|
||||||
|
t.read()
|
||||||
|
}
|
||||||
|
|
||||||
|
return t.Buffer[t.Position+1]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Reader) read() error {
|
||||||
|
n, err := t.File.Read(t.Buffer[len(t.Buffer):cap(t.Buffer)])
|
||||||
|
t.Buffer = t.Buffer[:len(t.Buffer)+n]
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(t.Buffer) >= cap(t.Buffer) {
|
||||||
|
d := append(t.Buffer[:cap(t.Buffer)], 0)
|
||||||
|
t.Buffer = d[:len(t.Buffer)]
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *Reader) Open(path string) (err error) {
|
||||||
|
t.File, err = os.Open(path)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
info, err := t.File.Stat()
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Size = Position(info.Size())
|
||||||
|
t.Buffer = make([]byte, 0, t.Size+1)
|
||||||
|
return nil
|
||||||
|
}
|
@ -1,64 +1,65 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// Tokenize turns the file contents into a list of tokens.
|
// Tokenize turns the file contents into a list of tokens.
|
||||||
func Tokenize(buffer []byte) List {
|
func Tokenize(reader *Reader) List {
|
||||||
var (
|
var (
|
||||||
i Position
|
tokens = make(List, 0, 8+reader.Size/2)
|
||||||
tokens = make(List, 0, 8+len(buffer)/2)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
for i < Position(len(buffer)) {
|
reader.read()
|
||||||
switch buffer[i] {
|
|
||||||
|
for reader.Position < reader.Size {
|
||||||
|
switch reader.Current() {
|
||||||
case ' ', '\t':
|
case ' ', '\t':
|
||||||
case ',':
|
case ',':
|
||||||
tokens = append(tokens, Token{Kind: Separator, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: Separator, Position: reader.Position, Length: 1})
|
||||||
case '(':
|
case '(':
|
||||||
tokens = append(tokens, Token{Kind: GroupStart, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: GroupStart, Position: reader.Position, Length: 1})
|
||||||
case ')':
|
case ')':
|
||||||
tokens = append(tokens, Token{Kind: GroupEnd, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: GroupEnd, Position: reader.Position, Length: 1})
|
||||||
case '{':
|
case '{':
|
||||||
tokens = append(tokens, Token{Kind: BlockStart, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: BlockStart, Position: reader.Position, Length: 1})
|
||||||
case '}':
|
case '}':
|
||||||
tokens = append(tokens, Token{Kind: BlockEnd, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: BlockEnd, Position: reader.Position, Length: 1})
|
||||||
case '[':
|
case '[':
|
||||||
tokens = append(tokens, Token{Kind: ArrayStart, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: ArrayStart, Position: reader.Position, Length: 1})
|
||||||
case ']':
|
case ']':
|
||||||
tokens = append(tokens, Token{Kind: ArrayEnd, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: ArrayEnd, Position: reader.Position, Length: 1})
|
||||||
case '\n':
|
case '\n':
|
||||||
tokens = append(tokens, Token{Kind: NewLine, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: NewLine, Position: reader.Position, Length: 1})
|
||||||
case '-':
|
case '-':
|
||||||
tokens, i = dash(tokens, buffer, i)
|
tokens = dash(tokens, reader)
|
||||||
case '/':
|
case '/':
|
||||||
tokens, i = slash(tokens, buffer, i)
|
tokens = slash(tokens, reader)
|
||||||
continue
|
continue
|
||||||
case '"', '\'':
|
case '"', '\'':
|
||||||
tokens, i = quote(tokens, buffer, i)
|
tokens = quote(tokens, reader)
|
||||||
continue
|
continue
|
||||||
case '0':
|
case '0':
|
||||||
tokens, i = zero(tokens, buffer, i)
|
tokens = zero(tokens, reader)
|
||||||
continue
|
continue
|
||||||
default:
|
default:
|
||||||
if isIdentifierStart(buffer[i]) {
|
if isIdentifierStart(reader.Current()) {
|
||||||
tokens, i = identifier(tokens, buffer, i)
|
tokens = identifier(tokens, reader)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if isDigit(buffer[i]) {
|
if isDigit(reader.Current()) {
|
||||||
tokens, i = digit(tokens, buffer, i)
|
tokens = digit(tokens, reader)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if isOperator(buffer[i]) {
|
if isOperator(reader.Current()) {
|
||||||
tokens, i = operator(tokens, buffer, i)
|
tokens = operator(tokens, reader)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: Invalid, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: Invalid, Position: reader.Position, Length: 1})
|
||||||
}
|
}
|
||||||
|
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: EOF, Position: i, Length: 0})
|
tokens = append(tokens, Token{Kind: EOF, Position: reader.Position, Length: 0})
|
||||||
return tokens
|
return tokens
|
||||||
}
|
}
|
||||||
|
@ -1,25 +1,25 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// dash handles all tokens starting with '-'.
|
// dash handles all tokens starting with '-'.
|
||||||
func dash(tokens List, buffer []byte, i Position) (List, Position) {
|
func dash(tokens List, reader *Reader) List {
|
||||||
if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() {
|
if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() {
|
||||||
tokens = append(tokens, Token{Kind: Negate, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: Negate, Position: reader.Position, Length: 1})
|
||||||
} else {
|
} else {
|
||||||
if i+1 < Position(len(buffer)) {
|
if reader.Position+1 < reader.Size {
|
||||||
switch buffer[i+1] {
|
switch reader.Next() {
|
||||||
case '=':
|
case '=':
|
||||||
tokens = append(tokens, Token{Kind: SubAssign, Position: i, Length: 2})
|
tokens = append(tokens, Token{Kind: SubAssign, Position: reader.Position, Length: 2})
|
||||||
i++
|
reader.Advance()
|
||||||
case '>':
|
case '>':
|
||||||
tokens = append(tokens, Token{Kind: ReturnType, Position: i, Length: 2})
|
tokens = append(tokens, Token{Kind: ReturnType, Position: reader.Position, Length: 2})
|
||||||
i++
|
reader.Advance()
|
||||||
default:
|
default:
|
||||||
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1})
|
tokens = append(tokens, Token{Kind: Sub, Position: reader.Position, Length: 1})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return tokens, i
|
return tokens
|
||||||
}
|
}
|
||||||
|
@ -1,24 +1,24 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// digit handles all tokens that qualify as a digit.
|
// digit handles all tokens that qualify as a digit.
|
||||||
func digit(tokens List, buffer []byte, i Position) (List, Position) {
|
func digit(tokens List, reader *Reader) List {
|
||||||
position := i
|
position := reader.Position
|
||||||
i++
|
reader.Advance()
|
||||||
|
|
||||||
for i < Position(len(buffer)) && isDigit(buffer[i]) {
|
for reader.Position < reader.Size && isDigit(reader.Current()) {
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
last := len(tokens) - 1
|
last := len(tokens) - 1
|
||||||
|
|
||||||
if len(tokens) > 0 && tokens[last].Kind == Negate {
|
if len(tokens) > 0 && tokens[last].Kind == Negate {
|
||||||
tokens[last].Kind = Number
|
tokens[last].Kind = Number
|
||||||
tokens[last].Length = Length(i-position) + 1
|
tokens[last].Length = Length(reader.Position-position) + 1
|
||||||
} else {
|
} else {
|
||||||
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
|
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
|
||||||
}
|
}
|
||||||
|
|
||||||
return tokens, i
|
return tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
func isDigit(c byte) bool {
|
func isDigit(c byte) bool {
|
||||||
|
@ -1,15 +1,15 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// identifier handles all tokens that qualify as an identifier.
|
// identifier handles all tokens that qualify as an identifier.
|
||||||
func identifier(tokens List, buffer []byte, i Position) (List, Position) {
|
func identifier(tokens List, reader *Reader) List {
|
||||||
position := i
|
position := reader.Position
|
||||||
i++
|
reader.Advance()
|
||||||
|
|
||||||
for i < Position(len(buffer)) && isIdentifier(buffer[i]) {
|
for reader.Position < reader.Size && isIdentifier(reader.Current()) {
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
identifier := buffer[position:i]
|
identifier := reader.Buffer[position:reader.Position]
|
||||||
kind := Identifier
|
kind := Identifier
|
||||||
|
|
||||||
switch string(identifier) {
|
switch string(identifier) {
|
||||||
@ -37,8 +37,7 @@ func identifier(tokens List, buffer []byte, i Position) (List, Position) {
|
|||||||
kind = Switch
|
kind = Switch
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
|
return append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))})
|
||||||
return tokens, i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func isIdentifier(c byte) bool {
|
func isIdentifier(c byte) bool {
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// operator handles all tokens that qualify as an operator.
|
// operator handles all tokens that qualify as an operator.
|
||||||
func operator(tokens List, buffer []byte, i Position) (List, Position) {
|
func operator(tokens List, reader *Reader) List {
|
||||||
position := i
|
position := reader.Position
|
||||||
i++
|
reader.Advance()
|
||||||
|
|
||||||
for i < Position(len(buffer)) && isOperator(buffer[i]) {
|
for reader.Position < reader.Size && isOperator(reader.Current()) {
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
kind := Invalid
|
kind := Invalid
|
||||||
|
|
||||||
switch string(buffer[position:i]) {
|
switch string(reader.Buffer[position:reader.Position]) {
|
||||||
case "!":
|
case "!":
|
||||||
kind = Not
|
kind = Not
|
||||||
case "!=":
|
case "!=":
|
||||||
@ -72,8 +72,7 @@ func operator(tokens List, buffer []byte, i Position) (List, Position) {
|
|||||||
kind = LogicalOr
|
kind = LogicalOr
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
|
return append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
|
||||||
return tokens, i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func isOperator(c byte) bool {
|
func isOperator(c byte) bool {
|
||||||
|
@ -1,20 +1,20 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// quote handles all tokens starting with a single or double quote.
|
// quote handles all tokens starting with a single or double quote.
|
||||||
func quote(tokens List, buffer []byte, i Position) (List, Position) {
|
func quote(tokens List, reader *Reader) List {
|
||||||
limiter := buffer[i]
|
limiter := reader.Current()
|
||||||
start := i
|
start := reader.Position
|
||||||
end := Position(len(buffer))
|
end := reader.Size
|
||||||
i++
|
reader.Advance()
|
||||||
|
|
||||||
for i < Position(len(buffer)) {
|
for reader.Position < reader.Size {
|
||||||
if buffer[i] == limiter && (buffer[i-1] != '\\' || buffer[i-2] == '\\') {
|
if reader.Current() == limiter && (reader.Buffer[reader.Position-1] != '\\' || reader.Buffer[reader.Position-2] == '\\') {
|
||||||
end = i + 1
|
end = reader.Position + 1
|
||||||
i++
|
reader.Advance()
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
kind := String
|
kind := String
|
||||||
@ -23,6 +23,5 @@ func quote(tokens List, buffer []byte, i Position) (List, Position) {
|
|||||||
kind = Rune
|
kind = Rune
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
|
return append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)})
|
||||||
return tokens, i
|
|
||||||
}
|
}
|
||||||
|
@ -1,34 +1,34 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// slash handles all tokens starting with '/'.
|
// slash handles all tokens starting with '/'.
|
||||||
func slash(tokens List, buffer []byte, i Position) (List, Position) {
|
func slash(tokens List, reader *Reader) List {
|
||||||
if i+1 < Position(len(buffer)) && buffer[i+1] == '/' {
|
if reader.Next() == '/' {
|
||||||
position := i
|
position := reader.Position
|
||||||
|
|
||||||
for i < Position(len(buffer)) && buffer[i] != '\n' {
|
for reader.Position < reader.Size && reader.Current() != '\n' {
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(i - position)})
|
tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(reader.Position - position)})
|
||||||
} else {
|
} else {
|
||||||
position := i
|
position := reader.Position
|
||||||
i++
|
reader.Advance()
|
||||||
|
|
||||||
for i < Position(len(buffer)) && isOperator(buffer[i]) {
|
for reader.Position < reader.Size && isOperator(reader.Current()) {
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
kind := Invalid
|
kind := Invalid
|
||||||
|
|
||||||
switch string(buffer[position:i]) {
|
switch string(reader.Buffer[position:reader.Position]) {
|
||||||
case "/":
|
case "/":
|
||||||
kind = Div
|
kind = Div
|
||||||
case "/=":
|
case "/=":
|
||||||
kind = DivAssign
|
kind = DivAssign
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)})
|
tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(reader.Position - position)})
|
||||||
}
|
}
|
||||||
|
|
||||||
return tokens, i
|
return tokens
|
||||||
}
|
}
|
||||||
|
@ -1,35 +1,33 @@
|
|||||||
package token
|
package token
|
||||||
|
|
||||||
// zero handles all tokens starting with a '0'.
|
// zero handles all tokens starting with a '0'.
|
||||||
func zero(tokens List, buffer []byte, i Position) (List, Position) {
|
func zero(tokens List, reader *Reader) List {
|
||||||
position := i
|
position := reader.Position
|
||||||
i++
|
reader.Advance()
|
||||||
|
|
||||||
if i >= Position(len(buffer)) {
|
if reader.Position >= reader.Size {
|
||||||
tokens = append(tokens, Token{Kind: Number, Position: position, Length: 1})
|
return append(tokens, Token{Kind: Number, Position: position, Length: 1})
|
||||||
return tokens, i
|
|
||||||
}
|
}
|
||||||
|
|
||||||
filter := isDigit
|
filter := isDigit
|
||||||
|
|
||||||
switch buffer[i] {
|
switch reader.Current() {
|
||||||
case 'x':
|
case 'x':
|
||||||
i++
|
reader.Advance()
|
||||||
filter = isHexDigit
|
filter = isHexDigit
|
||||||
|
|
||||||
case 'b':
|
case 'b':
|
||||||
i++
|
reader.Advance()
|
||||||
filter = isBinaryDigit
|
filter = isBinaryDigit
|
||||||
|
|
||||||
case 'o':
|
case 'o':
|
||||||
i++
|
reader.Advance()
|
||||||
filter = isOctalDigit
|
filter = isOctalDigit
|
||||||
}
|
}
|
||||||
|
|
||||||
for i < Position(len(buffer)) && filter(buffer[i]) {
|
for reader.Position < reader.Size && filter(reader.Current()) {
|
||||||
i++
|
reader.Advance()
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)})
|
return append(tokens, Token{Kind: Number, Position: position, Length: Length(reader.Position - position)})
|
||||||
return tokens, i
|
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user