Added a tokenizer

2023-10-31 11:57:37 +01:00
parent 8b19989372
commit ac157e580c
7 changed files with 274 additions and 12 deletions
--- a/src/build/Scan.go
+++ b/src/build/Scan.go
@ -8,6 +8,7 @@ import (

 	"git.akyoto.dev/cli/q/src/directory"
 	"git.akyoto.dev/cli/q/src/log"
+	"git.akyoto.dev/cli/q/src/token"
 )

 // Scan scans the directory.
@ -51,13 +52,17 @@ func scanDirectory(path string, functions chan<- *Function, errors chan<- error)

 // scanFile scans a single file.
 func scanFile(path string, functions chan<- *Function) error {
-	log.Info.Println(path)
 	contents, err := os.ReadFile(path)

 	if err != nil {
 		return err
 	}

-	log.Info.Println(string(contents))
+	// Split the raw file contents into tokens.
+	tokens := token.Tokenize(contents)
+
+	// Log the kind, position and whitespace-trimmed text of every token.
+	// Nothing is sent on the functions channel yet.
+	for _, t := range tokens {
+		log.Info.Println(t.Kind, t.Position, strings.TrimSpace(t.String()))
+	}
+
 	return nil
 }
--- a/src/cli/Build.go
+++ b/src/cli/Build.go
@ -1,7 +1,6 @@
 package cli

 import (
-	"path/filepath"
 	"strings"

 	"git.akyoto.dev/cli/q/src/build"
@ -30,15 +29,7 @@ func Build(args []string) int {
 		}
 	}

-	fullPath, err := filepath.Abs(b.Directory)
-
-	if err != nil {
-		log.Error.Println(err)
-		return 1
-	}
-
-	b.Directory = fullPath
-	err = b.Run()
+	err := b.Run()

 	if err != nil {
 		log.Error.Println(err)
--- a/src/keywords/All.go
+++ b/src/keywords/All.go
@ -0,0 +1,6 @@
+package keywords
+
+// All defines the keywords used in the language.
+// Looking up a keyword yields true; any other string yields false,
+// because a missing map entry returns the zero value.
+var All = map[string]bool{
+	"return": true, // currently the only keyword
+}
--- a/src/token/Kind.go
+++ b/src/token/Kind.go
@ -0,0 +1,116 @@
+package token
+
+// Kind identifies the category a token belongs to.
+type Kind uint8
+
+// The available token kinds.
+// Invalid is declared first, which makes it the zero value of Kind.
+const (
+	Invalid Kind = iota // Invalid represents an invalid token.
+	NewLine             // NewLine represents the newline character.
+	Identifier          // Identifier represents a series of characters used to identify a variable or function.
+	Keyword             // Keyword represents a language keyword.
+	Text                // Text represents an uninterpreted series of characters in the source code.
+	Number              // Number represents a series of numerical characters.
+	Operator            // Operator represents a mathematical operator.
+	Separator           // Separator represents a comma.
+	Range               // Range represents '..'.
+	Question            // Question represents '?'.
+	Comment             // Comment represents a comment.
+	GroupStart          // GroupStart represents '('.
+	GroupEnd            // GroupEnd represents ')'.
+	BlockStart          // BlockStart represents '{'.
+	BlockEnd            // BlockEnd represents '}'.
+	ArrayStart          // ArrayStart represents '['.
+	ArrayEnd            // ArrayEnd represents ']'.
+)
+
+// kindNames maps every declared Kind to its printable name.
+// The array is indexed by the numeric value of the kind.
+var kindNames = [...]string{
+	Invalid:    "Invalid",
+	NewLine:    "NewLine",
+	Identifier: "Identifier",
+	Keyword:    "Keyword",
+	Text:       "Text",
+	Number:     "Number",
+	Operator:   "Operator",
+	Separator:  "Separator",
+	Range:      "Range",
+	Question:   "Question",
+	Comment:    "Comment",
+	GroupStart: "GroupStart",
+	GroupEnd:   "GroupEnd",
+	BlockStart: "BlockStart",
+	BlockEnd:   "BlockEnd",
+	ArrayStart: "ArrayStart",
+	ArrayEnd:   "ArrayEnd",
+}
+
+// String returns the name of the kind.
+// Values outside the declared constants yield "<undefined token>".
+func (kind Kind) String() string {
+	if int(kind) >= len(kindNames) {
+		return "<undefined token>"
+	}
+
+	return kindNames[kind]
+}
--- a/src/token/List.go
+++ b/src/token/List.go
@ -0,0 +1,17 @@
+package token
+
+import "strings"
+
+// List is an ordered sequence of tokens.
+type List []Token
+
+// String concatenates the text of every token, in order, without separators.
+func (list List) String() string {
+	var out strings.Builder
+
+	for i := range list {
+		out.WriteString(list[i].String())
+	}
+
+	return out.String()
+}
--- a/src/token/Token.go
+++ b/src/token/Token.go
@ -0,0 +1,15 @@
+package token
+
+// Token represents a single element in a source file.
+// The characters that make up an identifier are grouped into a single token.
+// This makes parsing easier and allows us to do better syntax checks.
+type Token struct {
+	// Kind is the category of the token.
+	Kind     Kind
+	// Position is the index in the tokenized buffer where the token's bytes begin.
+	// For Text tokens that is the byte after the opening quote.
+	Position int
+	// Bytes is the token text. Tokenize fills it with a sub-slice of the input
+	// buffer or with a shared pre-allocated slice, never with a private copy.
+	Bytes    []byte
+}
+
+// String returns the token text as a newly converted string.
+func (t Token) String() string {
+	return string(t.Bytes)
+}
--- a/src/token/Tokenize.go
+++ b/src/token/Tokenize.go
@ -0,0 +1,112 @@
+package token
+
+import "git.akyoto.dev/cli/q/src/keywords"
+
+// Pre-allocate these single-byte buffers so we can re-use them
+// instead of allocating a new buffer every time.
+// Tokenize stores them directly in Token.Bytes, so all tokens of the same
+// punctuation kind share one slice and its contents must not be modified.
+var (
+	groupStartBytes = []byte{'('}
+	groupEndBytes   = []byte{')'}
+	blockStartBytes = []byte{'{'}
+	blockEndBytes   = []byte{'}'}
+	arrayStartBytes = []byte{'['}
+	arrayEndBytes   = []byte{']'}
+	separatorBytes  = []byte{','}
+	newLineBytes    = []byte{'\n'}
+)
+
+// Tokenize splits the file contents into a list of tokens.
+//
+// Identifiers, keywords, double-quoted texts, brackets, commas and newlines
+// produce tokens; every other byte is skipped without emitting anything.
+// The bytes of identifier and text tokens are sub-slices of buffer, not copies.
+func Tokenize(buffer []byte) List {
+	tokens := make(List, 0, len(buffer)/2)
+
+	for i := 0; i < len(buffer); i++ {
+		switch c := buffer[i]; c {
+		// Texts
+		case '"':
+			// The token covers the bytes between the quotes.
+			// Without a closing quote it extends to the end of the buffer.
+			start := i + 1
+			end := start
+
+			for end < len(buffer) && buffer[end] != '"' {
+				end++
+			}
+
+			tokens = append(tokens, Token{Kind: Text, Position: start, Bytes: buffer[start:end]})
+
+			// The loop increment then steps over the closing quote.
+			i = end
+
+		// Parentheses start
+		case '(':
+			tokens = append(tokens, Token{Kind: GroupStart, Position: i, Bytes: groupStartBytes})
+
+		// Parentheses end
+		case ')':
+			tokens = append(tokens, Token{Kind: GroupEnd, Position: i, Bytes: groupEndBytes})
+
+		// Block start
+		case '{':
+			tokens = append(tokens, Token{Kind: BlockStart, Position: i, Bytes: blockStartBytes})
+
+		// Block end
+		case '}':
+			tokens = append(tokens, Token{Kind: BlockEnd, Position: i, Bytes: blockEndBytes})
+
+		// Array start
+		case '[':
+			tokens = append(tokens, Token{Kind: ArrayStart, Position: i, Bytes: arrayStartBytes})
+
+		// Array end
+		case ']':
+			tokens = append(tokens, Token{Kind: ArrayEnd, Position: i, Bytes: arrayEndBytes})
+
+		// Separator
+		case ',':
+			tokens = append(tokens, Token{Kind: Separator, Position: i, Bytes: separatorBytes})
+
+		// New line
+		case '\n':
+			tokens = append(tokens, Token{Kind: NewLine, Position: i, Bytes: newLineBytes})
+
+		// Identifiers and keywords
+		default:
+			if !isIdentifierStart(c) {
+				continue
+			}
+
+			start := i
+			end := i + 1
+
+			for end < len(buffer) && isIdentifier(buffer[end]) {
+				end++
+			}
+
+			word := buffer[start:end]
+			kind := Identifier
+
+			if keywords.All[string(word)] {
+				kind = Keyword
+			}
+
+			tokens = append(tokens, Token{Kind: kind, Position: start, Bytes: word})
+
+			// Stop on the last byte of the word; the loop increment moves past it.
+			i = end - 1
+		}
+	}
+
+	return tokens
+}
+
+// isIdentifierStart reports whether c may begin an identifier:
+// an ASCII letter or an underscore.
+func isIdentifierStart(c byte) bool {
+	switch {
+	case 'a' <= c && c <= 'z':
+		return true
+
+	case 'A' <= c && c <= 'Z':
+		return true
+
+	default:
+		return c == '_'
+	}
+}
+
+// isIdentifier reports whether c may appear inside an identifier:
+// an ASCII letter, an ASCII digit or an underscore.
+func isIdentifier(c byte) bool {
+	switch {
+	case 'a' <= c && c <= 'z':
+		return true
+
+	case 'A' <= c && c <= 'Z':
+		return true
+
+	case '0' <= c && c <= '9':
+		return true
+
+	default:
+		return c == '_'
+	}
+}