From ac157e580cc7f2e3fb94dc3507c6d0df6a8fec66 Mon Sep 17 00:00:00 2001
From: Eduard Urbach
Date: Tue, 31 Oct 2023 11:57:37 +0100
Subject: [PATCH] Added a tokenizer

---
 src/build/Scan.go     |   9 +++-
 src/cli/Build.go      |  11 +---
 src/keywords/All.go   |   6 +++
 src/token/Kind.go     | 116 ++++++++++++++++++++++++++++++++++++++++++
 src/token/List.go     |  17 +++++++
 src/token/Token.go    |  15 ++++++
 src/token/Tokenize.go | 112 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 274 insertions(+), 12 deletions(-)
 create mode 100644 src/keywords/All.go
 create mode 100644 src/token/Kind.go
 create mode 100644 src/token/List.go
 create mode 100644 src/token/Token.go
 create mode 100644 src/token/Tokenize.go

diff --git a/src/build/Scan.go b/src/build/Scan.go
index 9475727..229cead 100644
--- a/src/build/Scan.go
+++ b/src/build/Scan.go
@@ -8,6 +8,7 @@ import (
 
 	"git.akyoto.dev/cli/q/src/directory"
 	"git.akyoto.dev/cli/q/src/log"
+	"git.akyoto.dev/cli/q/src/token"
 )
 
 // Scan scans the directory.
@@ -51,13 +52,17 @@ func scanDirectory(path string, functions chan<- *Function, errors chan<- error)
 
 // scanFile scans a single file.
 func scanFile(path string, functions chan<- *Function) error {
-	log.Info.Println(path)
 	contents, err := os.ReadFile(path)
 
 	if err != nil {
 		return err
 	}
 
-	log.Info.Println(string(contents))
+	tokens := token.Tokenize(contents)
+
+	for _, t := range tokens {
+		log.Info.Println(t.Kind, t.Position, strings.TrimSpace(t.String()))
+	}
+
 	return nil
 }
diff --git a/src/cli/Build.go b/src/cli/Build.go
index fefbe24..ec1bf6a 100644
--- a/src/cli/Build.go
+++ b/src/cli/Build.go
@@ -1,7 +1,6 @@
 package cli
 
 import (
-	"path/filepath"
 	"strings"
 
 	"git.akyoto.dev/cli/q/src/build"
@@ -30,15 +29,7 @@ func Build(args []string) int {
 		}
 	}
 
-	fullPath, err := filepath.Abs(b.Directory)
-
-	if err != nil {
-		log.Error.Println(err)
-		return 1
-	}
-
-	b.Directory = fullPath
-	err = b.Run()
+	err := b.Run()
 
 	if err != nil {
 		log.Error.Println(err)
diff --git a/src/keywords/All.go b/src/keywords/All.go
new file mode 100644
index 0000000..5df0a39
--- /dev/null
+++ b/src/keywords/All.go
@@ -0,0 +1,6 @@
+package keywords
+
+// All defines the keywords used in the language.
+var All = map[string]bool{
+	"return": true,
+}
diff --git a/src/token/Kind.go b/src/token/Kind.go
new file mode 100644
index 0000000..742c381
--- /dev/null
+++ b/src/token/Kind.go
@@ -0,0 +1,116 @@
+package token
+
+// Kind represents the type of token.
+type Kind uint8
+
+const (
+	// Invalid represents an invalid token.
+	Invalid Kind = iota
+
+	// NewLine represents the newline character.
+	NewLine
+
+	// Identifier represents a series of characters used to identify a variable or function.
+	Identifier
+
+	// Keyword represents a language keyword.
+	Keyword
+
+	// Text represents an uninterpreted series of characters in the source code.
+	Text
+
+	// Number represents a series of numerical characters.
+	Number
+
+	// Operator represents a mathematical operator.
+	Operator
+
+	// Separator represents a comma.
+	Separator
+
+	// Range represents '..'.
+	Range
+
+	// Question represents '?'.
+	Question
+
+	// Comment represents a comment.
+	Comment
+
+	// GroupStart represents '('.
+	GroupStart
+
+	// GroupEnd represents ')'.
+	GroupEnd
+
+	// BlockStart represents '{'.
+	BlockStart
+
+	// BlockEnd represents '}'.
+	BlockEnd
+
+	// ArrayStart represents '['.
+	ArrayStart
+
+	// ArrayEnd represents ']'.
+	ArrayEnd
+)
+
+// String returns the text representation.
+func (kind Kind) String() string {
+	switch kind {
+	case NewLine:
+		return "NewLine"
+
+	case Identifier:
+		return "Identifier"
+
+	case Keyword:
+		return "Keyword"
+
+	case Text:
+		return "Text"
+
+	case Number:
+		return "Number"
+
+	case Operator:
+		return "Operator"
+
+	case Separator:
+		return "Separator"
+
+	case Range:
+		return "Range"
+
+	case Question:
+		return "Question"
+
+	case Comment:
+		return "Comment"
+
+	case GroupStart:
+		return "GroupStart"
+
+	case GroupEnd:
+		return "GroupEnd"
+
+	case BlockStart:
+		return "BlockStart"
+
+	case BlockEnd:
+		return "BlockEnd"
+
+	case ArrayStart:
+		return "ArrayStart"
+
+	case ArrayEnd:
+		return "ArrayEnd"
+
+	case Invalid:
+		return "Invalid"
+
+	default:
+		return ""
+	}
+}
diff --git a/src/token/List.go b/src/token/List.go
new file mode 100644
index 0000000..d5ea0ad
--- /dev/null
+++ b/src/token/List.go
@@ -0,0 +1,17 @@
+package token
+
+import "strings"
+
+// List is a slice of tokens.
+type List []Token
+
+// String implements string serialization.
+func (list List) String() string {
+	builder := strings.Builder{}
+
+	for _, t := range list {
+		builder.WriteString(t.String())
+	}
+
+	return builder.String()
+}
diff --git a/src/token/Token.go b/src/token/Token.go
new file mode 100644
index 0000000..f88b69c
--- /dev/null
+++ b/src/token/Token.go
@@ -0,0 +1,15 @@
+package token
+
+// Token represents a single element in a source file.
+// The characters that make up an identifier are grouped into a single token.
+// This makes parsing easier and allows us to do better syntax checks.
+type Token struct {
+	Kind     Kind
+	Position int
+	Bytes    []byte
+}
+
+// String returns the token text.
+func (t Token) String() string {
+	return string(t.Bytes)
+}
diff --git a/src/token/Tokenize.go b/src/token/Tokenize.go
new file mode 100644
index 0000000..9d359b8
--- /dev/null
+++ b/src/token/Tokenize.go
@@ -0,0 +1,112 @@
+package token
+
+import "git.akyoto.dev/cli/q/src/keywords"
+
+// Pre-allocate these byte buffers so we can re-use them
+// instead of allocating a new buffer every time.
+var (
+	groupStartBytes = []byte{'('}
+	groupEndBytes   = []byte{')'}
+	blockStartBytes = []byte{'{'}
+	blockEndBytes   = []byte{'}'}
+	arrayStartBytes = []byte{'['}
+	arrayEndBytes   = []byte{']'}
+	separatorBytes  = []byte{','}
+	newLineBytes    = []byte{'\n'}
+)
+
+// Tokenize turns the file contents into a list of tokens.
+func Tokenize(buffer []byte) List {
+	var (
+		i      int
+		c      byte
+		tokens = make(List, 0, len(buffer)/2)
+	)
+
+	for i < len(buffer) {
+		c = buffer[i]
+
+		switch {
+		// Identifiers
+		case isIdentifierStart(c):
+			position := i
+			i++
+
+			for i < len(buffer) && isIdentifier(buffer[i]) {
+				i++
+			}
+
+			token := Token{
+				Identifier,
+				position,
+				buffer[position:i],
+			}
+
+			if keywords.All[string(token.Bytes)] {
+				token.Kind = Keyword
+			}
+
+			tokens = append(tokens, token)
+			i--
+
+		// Texts
+		case c == '"':
+			i++
+			position := i
+
+			for i < len(buffer) && buffer[i] != '"' {
+				i++
+			}
+
+			tokens = append(tokens, Token{
+				Text,
+				position,
+				buffer[position:i],
+			})
+
+		// Parentheses start
+		case c == '(':
+			tokens = append(tokens, Token{GroupStart, i, groupStartBytes})
+
+		// Parentheses end
+		case c == ')':
+			tokens = append(tokens, Token{GroupEnd, i, groupEndBytes})
+
+		// Block start
+		case c == '{':
+			tokens = append(tokens, Token{BlockStart, i, blockStartBytes})
+
+		// Block end
+		case c == '}':
+			tokens = append(tokens, Token{BlockEnd, i, blockEndBytes})
+
+		// Array start
+		case c == '[':
+			tokens = append(tokens, Token{ArrayStart, i, arrayStartBytes})
+
+		// Array end
+		case c == ']':
+			tokens = append(tokens, Token{ArrayEnd, i, arrayEndBytes})
+
+		// Separator
+		case c == ',':
+			tokens = append(tokens, Token{Separator, i, separatorBytes})
+
+		// New line
+		case c == '\n':
+			tokens = append(tokens, Token{NewLine, i, newLineBytes})
+		}
+
+		i++
+	}
+
+	return tokens
+}
+
+func isIdentifierStart(c byte) bool {
+	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
+}
+
+func isIdentifier(c byte) bool {
+	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c >= '0' && c <= '9')
+}
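
Usage note: below is a minimal sketch of how the new token package can be
exercised on its own, mirroring the logging loop this patch adds to scanFile.
The main package and the sample input are illustrative only; the import path
is the one introduced above.

package main

import (
	"fmt"
	"strings"

	"git.akyoto.dev/cli/q/src/token"
)

func main() {
	// Hypothetical source snippet; Tokenize accepts any byte slice.
	src := []byte("main() {\n\treturn\n}")

	// Each token carries its Kind, byte offset, and raw bytes.
	for _, t := range token.Tokenize(src) {
		fmt.Println(t.Kind, t.Position, strings.TrimSpace(t.String()))
	}
}

Running it should print one line per token, e.g. "Identifier 0 main",
"GroupStart 4 (" and "BlockStart 7 {", with "return" reported as a Keyword
because it is listed in keywords.All.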