Reorganized file structure

commit 6fe30f31da (parent c7354b8613)
2024-06-10 15:51:39 +02:00
57 changed files with 431 additions and 614 deletions


@@ -0,0 +1,6 @@
package token

// Keywords defines the keywords used in the language.
var Keywords = map[string]bool{
	"return": true,
}
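
Further down in this commit, Tokenize consults this map to promote identifiers to keywords. A minimal sketch of that lookup, using a hypothetical helper name (the real check is inlined in Tokenize):

// isKeyword reports whether an identifier is a reserved word.
// Hypothetical helper for illustration only; Tokenize performs
// the same lookup inline via Keywords[string(token.Bytes)].
func isKeyword(identifier []byte) bool {
	return Keywords[string(identifier)]
}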

src/build/token/Kind.go (new file, 72 lines)

@@ -0,0 +1,72 @@
package token

// Kind represents the type of token.
type Kind uint8

const (
	// Invalid represents an invalid token.
	Invalid Kind = iota

	// NewLine represents the newline character.
	NewLine

	// Identifier represents a series of characters used to identify a variable or function.
	Identifier

	// Keyword represents a language keyword.
	Keyword

	// String represents an uninterpreted series of characters in the source code.
	String

	// Number represents a series of numerical characters.
	Number

	// Operator represents a mathematical operator.
	Operator

	// Separator represents a comma.
	Separator

	// Comment represents a comment.
	Comment

	// GroupStart represents '('.
	GroupStart

	// GroupEnd represents ')'.
	GroupEnd

	// BlockStart represents '{'.
	BlockStart

	// BlockEnd represents '}'.
	BlockEnd

	// ArrayStart represents '['.
	ArrayStart

	// ArrayEnd represents ']'.
	ArrayEnd
)

// String returns the text representation.
func (kind Kind) String() string {
	return [...]string{
		"Invalid",
		"NewLine",
		"Identifier",
		"Keyword",
		"String",
		"Number",
		"Operator",
		"Separator",
		"Comment",
		"GroupStart",
		"GroupEnd",
		"BlockStart",
		"BlockEnd",
		"ArrayStart",
		"ArrayEnd",
	}[kind]
}
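
Since Kind has a String method it satisfies fmt.Stringer, so kinds print by name. Note that the array literal must list the names in exactly the order of the const block above, because the kind value itself is the array index. A small usage sketch, assuming the module path that the tests below import:

package main

import (
	"fmt"

	"git.akyoto.dev/cli/q/src/build/token"
)

func main() {
	fmt.Println(token.Keyword)          // Keyword
	fmt.Println(token.Kind(3).String()) // Keyword: it is the fourth constant (iota = 3)
}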

src/build/token/List.go (new file, 25 lines)

@@ -0,0 +1,25 @@
package token

import (
	"bytes"
)

// List is a slice of tokens.
type List []Token

// String implements string serialization.
func (list List) String() string {
	builder := bytes.Buffer{}
	var last Token

	for _, t := range list {
		if t.Kind == Identifier && last.Kind == Separator {
			builder.WriteByte(' ')
		}

		builder.Write(t.Bytes)
		last = t
	}

	return builder.String()
}
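
String re-inserts a single space after each separator, so a token list serializes back to readable source even though whitespace itself is never tokenized; TestTokenText in the test file below pins down this behavior. A sketch:

package main

import (
	"fmt"

	"git.akyoto.dev/cli/q/src/build/token"
)

func main() {
	list := token.List{
		{Kind: token.Identifier, Bytes: []byte("hello"), Position: 0},
		{Kind: token.Separator, Bytes: []byte(","), Position: 5},
		{Kind: token.Identifier, Bytes: []byte("world"), Position: 7},
	}

	fmt.Println(list.String()) // hello, world
}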

src/build/token/Token.go (new file, 15 lines)

@@ -0,0 +1,15 @@
package token

// Token represents a single element in a source file.
// The characters that make up an identifier are grouped into a single token.
// This makes parsing easier and allows us to do better syntax checks.
type Token struct {
	Kind     Kind
	Position int
	Bytes    []byte
}

// Text returns the token text.
func (t Token) Text() string {
	return string(t.Bytes)
}
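
For identifiers, numbers, and strings, Bytes is a sub-slice of the original source buffer (see Tokenize below), so constructing a token copies no text; Text converts to a string on demand. A sketch of that aliasing:

package main

import (
	"fmt"

	"git.akyoto.dev/cli/q/src/build/token"
)

func main() {
	src := []byte("main(){}")

	// The token borrows its bytes from the source buffer,
	// just as Tokenize constructs them below.
	tok := token.Token{Kind: token.Identifier, Position: 0, Bytes: src[:4]}
	fmt.Println(tok.Text()) // main
}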


@@ -0,0 +1,212 @@
package token_test

import (
	"testing"

	"git.akyoto.dev/cli/q/src/build/token"
	"git.akyoto.dev/go/assert"
)

func TestFunction(t *testing.T) {
	tokens := token.Tokenize([]byte("main(){}"))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.Identifier,
			Bytes:    []byte("main"),
			Position: 0,
		},
		{
			Kind:     token.GroupStart,
			Bytes:    []byte("("),
			Position: 4,
		},
		{
			Kind:     token.GroupEnd,
			Bytes:    []byte(")"),
			Position: 5,
		},
		{
			Kind:     token.BlockStart,
			Bytes:    []byte("{"),
			Position: 6,
		},
		{
			Kind:     token.BlockEnd,
			Bytes:    []byte("}"),
			Position: 7,
		},
	})
}

func TestKeyword(t *testing.T) {
	tokens := token.Tokenize([]byte("return x"))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.Keyword,
			Bytes:    []byte("return"),
			Position: 0,
		},
		{
			Kind:     token.Identifier,
			Bytes:    []byte("x"),
			Position: 7,
		},
	})
}

func TestArray(t *testing.T) {
	tokens := token.Tokenize([]byte("array[i]"))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.Identifier,
			Bytes:    []byte("array"),
			Position: 0,
		},
		{
			Kind:     token.ArrayStart,
			Bytes:    []byte("["),
			Position: 5,
		},
		{
			Kind:     token.Identifier,
			Bytes:    []byte("i"),
			Position: 6,
		},
		{
			Kind:     token.ArrayEnd,
			Bytes:    []byte("]"),
			Position: 7,
		},
	})
}

func TestNewline(t *testing.T) {
	tokens := token.Tokenize([]byte("\n\n"))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.NewLine,
			Bytes:    []byte("\n"),
			Position: 0,
		},
		{
			Kind:     token.NewLine,
			Bytes:    []byte("\n"),
			Position: 1,
		},
	})
}

func TestNumber(t *testing.T) {
	tokens := token.Tokenize([]byte(`123 -456`))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.Number,
			Bytes:    []byte("123"),
			Position: 0,
		},
		{
			Kind:     token.Number,
			Bytes:    []byte("-456"),
			Position: 4,
		},
	})
}

func TestSeparator(t *testing.T) {
	tokens := token.Tokenize([]byte("a,b,c"))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.Identifier,
			Bytes:    []byte("a"),
			Position: 0,
		},
		{
			Kind:     token.Separator,
			Bytes:    []byte(","),
			Position: 1,
		},
		{
			Kind:     token.Identifier,
			Bytes:    []byte("b"),
			Position: 2,
		},
		{
			Kind:     token.Separator,
			Bytes:    []byte(","),
			Position: 3,
		},
		{
			Kind:     token.Identifier,
			Bytes:    []byte("c"),
			Position: 4,
		},
	})
}

func TestString(t *testing.T) {
	tokens := token.Tokenize([]byte(`"Hello" "World"`))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.String,
			Bytes:    []byte(`"Hello"`),
			Position: 0,
		},
		{
			Kind:     token.String,
			Bytes:    []byte(`"World"`),
			Position: 8,
		},
	})
}

func TestStringMultiline(t *testing.T) {
	tokens := token.Tokenize([]byte("\"Hello\nWorld\""))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.String,
			Bytes:    []byte("\"Hello\nWorld\""),
			Position: 0,
		},
	})
}

func TestStringEOF(t *testing.T) {
	tokens := token.Tokenize([]byte(`"EOF`))
	assert.DeepEqual(t, tokens, token.List{
		{
			Kind:     token.String,
			Bytes:    []byte(`"EOF`),
			Position: 0,
		},
	})
}

func TestTokenText(t *testing.T) {
	hello := token.Token{Kind: token.Identifier, Bytes: []byte("hello"), Position: 0}
	comma := token.Token{Kind: token.Separator, Bytes: []byte(","), Position: 5}
	world := token.Token{Kind: token.Identifier, Bytes: []byte("world"), Position: 7}

	assert.Equal(t, hello.Text(), "hello")
	assert.Equal(t, world.Text(), "world")

	list := token.List{hello, comma, world}
	assert.Equal(t, list.String(), "hello, world")
}

func TestTokenKind(t *testing.T) {
	assert.Equal(t, token.Invalid.String(), "Invalid")
	assert.Equal(t, token.NewLine.String(), "NewLine")
	assert.Equal(t, token.Identifier.String(), "Identifier")
	assert.Equal(t, token.Keyword.String(), "Keyword")
	assert.Equal(t, token.String.String(), "String")
	assert.Equal(t, token.Number.String(), "Number")
	assert.Equal(t, token.Operator.String(), "Operator")
	assert.Equal(t, token.Separator.String(), "Separator")
	assert.Equal(t, token.Comment.String(), "Comment")
	assert.Equal(t, token.GroupStart.String(), "GroupStart")
	assert.Equal(t, token.GroupEnd.String(), "GroupEnd")
	assert.Equal(t, token.BlockStart.String(), "BlockStart")
	assert.Equal(t, token.BlockEnd.String(), "BlockEnd")
	assert.Equal(t, token.ArrayStart.String(), "ArrayStart")
	assert.Equal(t, token.ArrayEnd.String(), "ArrayEnd")
}

src/build/token/Tokenize.go (new file, 145 lines)

@@ -0,0 +1,145 @@
package token

// Pre-allocate these byte buffers so we can re-use them
// instead of allocating a new buffer every time.
var (
	groupStartBytes = []byte{'('}
	groupEndBytes   = []byte{')'}
	blockStartBytes = []byte{'{'}
	blockEndBytes   = []byte{'}'}
	arrayStartBytes = []byte{'['}
	arrayEndBytes   = []byte{']'}
	separatorBytes  = []byte{','}
	newLineBytes    = []byte{'\n'}
)

// Tokenize turns the file contents into a list of tokens.
func Tokenize(buffer []byte) List {
	var (
		i      int
		tokens = make(List, 0, len(buffer)/2)
	)

	for i < len(buffer) {
		switch buffer[i] {
		// Texts
		case '"':
			start := i
			end := len(buffer)
			i++

			for i < len(buffer) {
				if buffer[i] == '"' {
					end = i + 1
					break
				}

				i++
			}

			tokens = append(tokens, Token{
				String,
				start,
				buffer[start:end],
			})

		// Parentheses start
		case '(':
			tokens = append(tokens, Token{GroupStart, i, groupStartBytes})

		// Parentheses end
		case ')':
			tokens = append(tokens, Token{GroupEnd, i, groupEndBytes})

		// Block start
		case '{':
			tokens = append(tokens, Token{BlockStart, i, blockStartBytes})

		// Block end
		case '}':
			tokens = append(tokens, Token{BlockEnd, i, blockEndBytes})

		// Array start
		case '[':
			tokens = append(tokens, Token{ArrayStart, i, arrayStartBytes})

		// Array end
		case ']':
			tokens = append(tokens, Token{ArrayEnd, i, arrayEndBytes})

		// Separator
		case ',':
			tokens = append(tokens, Token{Separator, i, separatorBytes})

		// New line
		case '\n':
			tokens = append(tokens, Token{NewLine, i, newLineBytes})

		default:
			// Identifiers
			if isIdentifierStart(buffer[i]) {
				position := i
				i++

				for i < len(buffer) && isIdentifier(buffer[i]) {
					i++
				}

				token := Token{
					Identifier,
					position,
					buffer[position:i],
				}

				if Keywords[string(token.Bytes)] {
					token.Kind = Keyword
				}

				tokens = append(tokens, token)
				continue
			}

			// Numbers
			if isNumberStart(buffer[i]) {
				position := i
				i++

				for i < len(buffer) && isNumber(buffer[i]) {
					i++
				}

				tokens = append(tokens, Token{
					Number,
					position,
					buffer[position:i],
				})

				continue
			}
		}

		i++
	}

	return tokens
}

func isIdentifier(c byte) bool {
	return isLetter(c) || isNumber(c) || c == '_'
}

func isIdentifierStart(c byte) bool {
	return isLetter(c) || c == '_'
}

func isLetter(c byte) bool {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}

func isNumber(c byte) bool {
	return (c >= '0' && c <= '9')
}

func isNumberStart(c byte) bool {
	return isNumber(c) || c == '-'
}
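
Putting it together, a sketch of driving the tokenizer from outside the package (module path taken from the tests above):

package main

import (
	"fmt"

	"git.akyoto.dev/cli/q/src/build/token"
)

func main() {
	for _, t := range token.Tokenize([]byte("main(){ return x }")) {
		fmt.Printf("%-10s %q at offset %d\n", t.Kind, t.Text(), t.Position)
	}
}

The shared single-byte slices at the top of the file mean single-character tokens never allocate, and the len(buffer)/2 capacity is a heuristic that avoids repeated growth of the token slice for typical source files.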