Added a tokenizer

This commit is contained in:
Eduard Urbach 2023-10-31 11:57:37 +01:00
parent 8b19989372
commit ac157e580c
Signed by: akyoto
GPG Key ID: C874F672B1AF20C0
7 changed files with 274 additions and 12 deletions

View File

@ -8,6 +8,7 @@ import (
"git.akyoto.dev/cli/q/src/directory"
"git.akyoto.dev/cli/q/src/log"
"git.akyoto.dev/cli/q/src/token"
)
// Scan scans the directory.
@ -51,13 +52,17 @@ func scanDirectory(path string, functions chan<- *Function, errors chan<- error)
// scanFile scans a single file.
// It reads the file, logs its contents and logs every token produced
// by the tokenizer.
// NOTE(review): the `functions` channel parameter is not used in this body yet.
func scanFile(path string, functions chan<- *Function) error {
	log.Info.Println(path)

	contents, err := os.ReadFile(path)

	if err != nil {
		return err
	}

	log.Info.Println(string(contents))

	// Log each token for debugging purposes.
	for _, tok := range token.Tokenize(contents) {
		log.Info.Println(tok.Kind, tok.Position, strings.TrimSpace(tok.String()))
	}

	return nil
}

View File

@ -1,7 +1,6 @@
package cli
import (
"path/filepath"
"strings"
"git.akyoto.dev/cli/q/src/build"
@ -30,15 +29,7 @@ func Build(args []string) int {
}
}
fullPath, err := filepath.Abs(b.Directory)
if err != nil {
log.Error.Println(err)
return 1
}
b.Directory = fullPath
err = b.Run()
err := b.Run()
if err != nil {
log.Error.Println(err)

6
src/keywords/All.go Normal file
View File

@ -0,0 +1,6 @@
package keywords
// All defines the keywords used in the language.
// Membership in the map marks a word as a keyword; the value is always true,
// so lookups like All[word] double as a boolean membership test.
var All = map[string]bool{
	"return": true,
}

116
src/token/Kind.go Normal file
View File

@ -0,0 +1,116 @@
package token
// Kind represents the type of token.
type Kind uint8

const (
	// Invalid represents an invalid token.
	Invalid Kind = iota

	// NewLine represents the newline character.
	NewLine

	// Identifier represents a series of characters used to identify a variable or function.
	Identifier

	// Keyword represents a language keyword.
	Keyword

	// Text represents an uninterpreted series of characters in the source code.
	Text

	// Number represents a series of numerical characters.
	Number

	// Operator represents a mathematical operator.
	Operator

	// Separator represents a comma.
	Separator

	// Range represents '..'.
	Range

	// Question represents '?'.
	Question

	// Comment represents a comment.
	Comment

	// GroupStart represents '('.
	GroupStart

	// GroupEnd represents ')'.
	GroupEnd

	// BlockStart represents '{'.
	BlockStart

	// BlockEnd represents '}'.
	BlockEnd

	// ArrayStart represents '['.
	ArrayStart

	// ArrayEnd represents ']'.
	ArrayEnd
)

// kindNames maps every defined Kind to its text representation.
// The array is indexed by the Kind value itself.
var kindNames = [...]string{
	Invalid:    "Invalid",
	NewLine:    "NewLine",
	Identifier: "Identifier",
	Keyword:    "Keyword",
	Text:       "Text",
	Number:     "Number",
	Operator:   "Operator",
	Separator:  "Separator",
	Range:      "Range",
	Question:   "Question",
	Comment:    "Comment",
	GroupStart: "GroupStart",
	GroupEnd:   "GroupEnd",
	BlockStart: "BlockStart",
	BlockEnd:   "BlockEnd",
	ArrayStart: "ArrayStart",
	ArrayEnd:   "ArrayEnd",
}

// String returns the text representation.
// Values outside the defined range yield "<undefined token>".
func (kind Kind) String() string {
	if int(kind) < len(kindNames) {
		return kindNames[kind]
	}

	return "<undefined token>"
}

17
src/token/List.go Normal file
View File

@ -0,0 +1,17 @@
package token
import "strings"
// List is a slice of tokens.
type List []Token

// String implements string serialization by concatenating
// the text of every token in the list.
func (list List) String() string {
	var sb strings.Builder

	for _, tok := range list {
		sb.WriteString(tok.String())
	}

	return sb.String()
}

15
src/token/Token.go Normal file
View File

@ -0,0 +1,15 @@
package token
// Token represents a single element in a source file.
// The characters that make up an identifier are grouped into a single token.
// This makes parsing easier and allows us to do better syntax checks.
type Token struct {
	Kind     Kind   // the category of the token (identifier, keyword, text, ...)
	Position int    // byte offset of the token within the source file
	Bytes    []byte // raw token bytes, sliced directly from the source buffer
}

// String returns the token text.
func (t Token) String() string {
	return string(t.Bytes)
}

112
src/token/Tokenize.go Normal file
View File

@ -0,0 +1,112 @@
package token
import "git.akyoto.dev/cli/q/src/keywords"
// Pre-allocate these byte buffers so we can re-use them
// instead of allocating a new buffer every time.
// Each single-character token kind shares one package-level byte slice,
// so appending such a token never allocates a new buffer.
var (
	groupStartBytes = []byte{'('}
	groupEndBytes   = []byte{')'}
	blockStartBytes = []byte{'{'}
	blockEndBytes   = []byte{'}'}
	arrayStartBytes = []byte{'['}
	arrayEndBytes   = []byte{']'}
	separatorBytes  = []byte{','}
	newLineBytes    = []byte{'\n'}
)
// Tokenize turns the file contents into a list of tokens.
// It recognizes identifiers, keywords, numbers, text literals and the
// single-character tokens handled below. Any other byte (including
// whitespace) produces no token and is skipped.
func Tokenize(buffer []byte) List {
	var (
		i      int
		c      byte
		tokens = make(List, 0, len(buffer)/2)
	)

	for i < len(buffer) {
		c = buffer[i]

		switch {
		// Identifiers and keywords
		case isIdentifierStart(c):
			position := i

			for i < len(buffer) && isIdentifier(buffer[i]) {
				i++
			}

			token := Token{
				Identifier,
				position,
				buffer[position:i],
			}

			// Identifiers that match a language keyword become Keyword tokens.
			if keywords.All[string(token.Bytes)] {
				token.Kind = Keyword
			}

			tokens = append(tokens, token)

			// `i` already points one past the identifier, so skip the i++ below.
			continue

		// Numbers
		// Fix: the Number kind existed but digit runs were silently dropped.
		case isNumber(c):
			position := i

			for i < len(buffer) && isNumber(buffer[i]) {
				i++
			}

			tokens = append(tokens, Token{
				Number,
				position,
				buffer[position:i],
			})

			// `i` already points one past the number, so skip the i++ below.
			continue

		// Texts
		case c == '"':
			i++
			position := i

			for i < len(buffer) && buffer[i] != '"' {
				i++
			}

			// The token excludes the quotes; the i++ at the bottom of the
			// loop skips the closing quote (no-op at EOF for unterminated texts).
			tokens = append(tokens, Token{
				Text,
				position,
				buffer[position:i],
			})

		// Parentheses start
		case c == '(':
			tokens = append(tokens, Token{GroupStart, i, groupStartBytes})

		// Parentheses end
		case c == ')':
			tokens = append(tokens, Token{GroupEnd, i, groupEndBytes})

		// Block start
		case c == '{':
			tokens = append(tokens, Token{BlockStart, i, blockStartBytes})

		// Block end
		case c == '}':
			tokens = append(tokens, Token{BlockEnd, i, blockEndBytes})

		// Array start
		case c == '[':
			tokens = append(tokens, Token{ArrayStart, i, arrayStartBytes})

		// Array end
		case c == ']':
			tokens = append(tokens, Token{ArrayEnd, i, arrayEndBytes})

		// Separator
		case c == ',':
			tokens = append(tokens, Token{Separator, i, separatorBytes})

		// New line
		case c == '\n':
			tokens = append(tokens, Token{NewLine, i, newLineBytes})
		}

		i++
	}

	return tokens
}

// isIdentifierStart reports whether c can start an identifier.
func isIdentifierStart(c byte) bool {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
}

// isIdentifier reports whether c can appear inside an identifier.
func isIdentifier(c byte) bool {
	return isIdentifierStart(c) || isNumber(c)
}

// isNumber reports whether c is a decimal digit.
func isNumber(c byte) bool {
	return c >= '0' && c <= '9'
}