Budowa prostego interpretera języka skryptowego w C (GCC)

Obraz ilustrujacy Budowa prostego interpretera jzyka skryptowego w C GCC

Wprowadzenie

W tym artykule przedstawiono implementację prostego interpretera języka skryptowego napisanego w języku C. Projekt pokazuje klasyczną architekturę:

kod źródłowy → lexer → parser → AST → interpreter

Celem jest zrozumienie fundamentów działania języków programowania bez użycia zewnętrznych narzędzi.

1. Założenia języka

Zaimplementowany język obsługuje:

zmienne (let)
operacje arytmetyczne (+ - * /)
porównania (< > ==)
instrukcję print
instrukcje sterujące:
- if
- while
bloki {}

Przykład programu

let x = 0;

while x < 5 {
    print x;
    let x = x + 1;
}

2. Analiza leksykalna (Lexer)

Lexer przekształca tekst wejściowy w strumień tokenów.

Typy tokenów

TOKEN_NUMBER, TOKEN_IDENT,
TOKEN_LET, TOKEN_PRINT,
TOKEN_IF, TOKEN_WHILE,
...

Kluczowe elementy

Pomijanie białych znaków

while (isspace(src[pos])) pos++;

Rozpoznawanie liczb

while (isdigit(src[pos]))

Rozpoznawanie identyfikatorów i słów kluczowych

if (!strcmp(t.value, "let")) t.type = TOKEN_LET;

Operatory i symbole

Obsługiwane są m.in.:

+ - * /
==
{ } ( )

3. Parser i AST

Parser buduje drzewo składniowe (AST).

Typy węzłów

NODE_NUMBER
NODE_VAR
NODE_BINARY
NODE_ASSIGN
NODE_PRINT
NODE_BLOCK
NODE_IF
NODE_WHILE

4. Parsowanie wyrażeń

Parser implementuje priorytety operatorów:

expression → comparison
comparison → arithmetic ( < > == )
arithmetic → term ( + - )
term → factor ( * / )
factor → liczba | zmienna | (wyrażenie)

Dzięki temu:

2 + 3 * 4 = 14

5. Parsowanie instrukcji

Obsługiwane konstrukcje:

Deklaracja zmiennej

let x = 5;

Wypisanie

print x;

Warunek

if x < 10 {
    print x;
}

Pętla

while x < 10 {
    ...
}

6. Struktura AST

AST jest reprezentowane jako struktura z unią:

struct AST {
    NodeType type;

    union {
        int number;
        char name[64];
        ...
    };
};

To pozwala na przechowywanie różnych typów danych w jednym węźle.

7. Interpreter

Interpreter wykonuje AST.

Przechowywanie zmiennych

Lista jednokierunkowa:

typedef struct Var {
    char name[64];
    int value;
    struct Var* next;
} Var;

Operacje na zmiennych

Ustawienie:

set_var(name, value)

Odczyt:

get_var(name)

8. Ewaluacja wyrażeń

Funkcja eval() oblicza wartość węzłów:

case NODE_BINARY:
    int l = eval(left);
    int r = eval(right);

Obsługiwane operatory:

+ - * /
< >
==

9. Wykonywanie instrukcji

Funkcja exec():

Przypisanie

set_var(...)

Print

printf("%d\n", ...)

Blok

for (...) exec(...)

If

if (condition) ...

While

while (condition) ...

10. Scope (ważna decyzja projektowa)

Bloki {} nie tworzą nowego scope.

Konsekwencje:

prostsza implementacja
brak shadowingu zmiennych
łatwiejsze debugowanie

Przykład:

let x = 0;

while x < 3 {
    let x = x + 1;  // nadpisuje x
}

11. Wczytywanie pliku

Program przyjmuje plik jako argument:

./minilang program.txt

Implementacja:

fopen → fread → buffer

12. Architektura programu

lexer      → tokeny
parser     → AST
interpreter→ wykonanie

Każda warstwa jest wyraźnie oddzielona.

13. Ograniczenia

Aktualna wersja:

brak else
brak funkcji
brak typów innych niż int
brak zarządzania pamięcią (brak free)

14. Możliwe rozszerzenia

średni poziom

else
funkcje (fn)
scope lokalny

zaawansowane

VM (bytecode)
garbage collector
stringi i tablice

15. Wnioski

Projekt pokazuje:

jak działa parser rekurencyjny
jak buduje się AST
jak interpreter wykonuje kod
jak działa prosty język programowania

Jest to solidna baza do budowy bardziej zaawansowanych systemów.

16. Gotowy program

Kod źródłowy `minilang.c`

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* ================= TOKENS ================= */

typedef enum {
    TOKEN_NUMBER, TOKEN_IDENT,
    TOKEN_LET, TOKEN_PRINT,
    TOKEN_IF, TOKEN_WHILE,

    TOKEN_PLUS, TOKEN_MINUS, TOKEN_STAR, TOKEN_SLASH,
    TOKEN_EQ, TOKEN_EQEQ,
    TOKEN_LT, TOKEN_GT,

    TOKEN_LPAREN, TOKEN_RPAREN,
    TOKEN_LBRACE, TOKEN_RBRACE,
    TOKEN_SEMI,

    TOKEN_EOF
} TokenType;

typedef struct {
    TokenType type;
    char value[64];
} Token;

/* ================= GLOBAL ================= */

char* src;
int pos = 0;
Token current_token;

/* ================= LEXER ================= */

void skip_ws() {
    while (isspace(src[pos])) pos++;
}

int is_alpha(char c) {
    return isalpha(c) || c == '_';
}

Token next_token() {
    skip_ws();

    Token t;
    t.value[0] = '\0';

    char c = src[pos];

    if (c == '\0') {
        t.type = TOKEN_EOF;
        return t;
    }

    if (isdigit(c)) {
        int i = 0;
        while (isdigit(src[pos]))
            t.value[i++] = src[pos++];
        t.value[i] = '\0';
        t.type = TOKEN_NUMBER;
        return t;
    }

    if (is_alpha(c)) {
        int i = 0;
        while (is_alpha(src[pos]))
            t.value[i++] = src[pos++];
        t.value[i] = '\0';

        if (!strcmp(t.value, "let")) t.type = TOKEN_LET;
        else if (!strcmp(t.value, "print")) t.type = TOKEN_PRINT;
        else if (!strcmp(t.value, "if")) t.type = TOKEN_IF;
        else if (!strcmp(t.value, "while")) t.type = TOKEN_WHILE;
        else t.type = TOKEN_IDENT;

        return t;
    }

    pos++;

    switch (c) {
        case '+': t.type = TOKEN_PLUS; break;
        case '-': t.type = TOKEN_MINUS; break;
        case '*': t.type = TOKEN_STAR; break;
        case '/': t.type = TOKEN_SLASH; break;
        case '<': t.type = TOKEN_LT; break;
        case '>': t.type = TOKEN_GT; break;

        case '=':
            if (src[pos] == '=') {
                pos++;
                t.type = TOKEN_EQEQ;
            } else t.type = TOKEN_EQ;
            break;

        case '(': t.type = TOKEN_LPAREN; break;
        case ')': t.type = TOKEN_RPAREN; break;
        case '{': t.type = TOKEN_LBRACE; break;
        case '}': t.type = TOKEN_RBRACE; break;
        case ';': t.type = TOKEN_SEMI; break;

        default:
            printf("Unknown char: %c\n", c);
            exit(1);
    }

    return t;
}

void advance() {
    current_token = next_token();
}

/* ================= AST ================= */

typedef enum {
    NODE_NUMBER, NODE_VAR, NODE_BINARY,
    NODE_ASSIGN, NODE_PRINT,
    NODE_BLOCK, NODE_IF, NODE_WHILE
} NodeType;

typedef struct AST AST;

struct AST {
    NodeType type;

    union {
        int number;
        char name[64];

        struct {
            AST* left;
            AST* right;
            char op;
        } binary;

        struct {
            char name[64];
            AST* value;
        } assign;

        struct {
            AST* expr;
        } print;

        struct {
            AST** stmts;
            int count;
        } block;

        struct {
            AST* cond;
            AST* then_branch;
        } if_stmt;

        struct {
            AST* cond;
            AST* body;
        } while_stmt;
    };
};

/* ================= PARSER ================= */

AST* new_node(NodeType t) {
    AST* n = malloc(sizeof(AST));
    n->type = t;
    return n;
}

/* ---- expressions ---- */

AST* parse_expression();

AST* parse_factor() {
    if (current_token.type == TOKEN_NUMBER) {
        AST* n = new_node(NODE_NUMBER);
        n->number = atoi(current_token.value);
        advance();
        return n;
    }

    if (current_token.type == TOKEN_IDENT) {
        AST* n = new_node(NODE_VAR);
        strcpy(n->name, current_token.value);
        advance();
        return n;
    }

    if (current_token.type == TOKEN_LPAREN) {
        advance();
        AST* n = parse_expression();
        if (current_token.type != TOKEN_RPAREN) {
            printf("Expected )\n");
            exit(1);
        }
        advance();
        return n;
    }

    printf("Unexpected token\n");
    exit(1);
}

AST* parse_term() {
    AST* node = parse_factor();

    while (current_token.type == TOKEN_STAR ||
           current_token.type == TOKEN_SLASH) {

        char op = (current_token.type == TOKEN_STAR) ? '*' : '/';
        advance();

        AST* right = parse_factor();

        AST* n = new_node(NODE_BINARY);
        n->binary.left = node;
        n->binary.right = right;
        n->binary.op = op;
        node = n;
    }

    return node;
}

AST* parse_arith() {
    AST* node = parse_term();

    while (current_token.type == TOKEN_PLUS ||
           current_token.type == TOKEN_MINUS) {

        char op = (current_token.type == TOKEN_PLUS) ? '+' : '-';
        advance();

        AST* right = parse_term();

        AST* n = new_node(NODE_BINARY);
        n->binary.left = node;
        n->binary.right = right;
        n->binary.op = op;
        node = n;
    }

    return node;
}

AST* parse_expression() {
    AST* node = parse_arith();

    if (current_token.type == TOKEN_LT ||
        current_token.type == TOKEN_GT ||
        current_token.type == TOKEN_EQEQ) {

        char op;
        if (current_token.type == TOKEN_LT) op = '<';
        else if (current_token.type == TOKEN_GT) op = '>';
        else op = '=';

        advance();
        AST* right = parse_arith();

        AST* n = new_node(NODE_BINARY);
        n->binary.left = node;
        n->binary.right = right;
        n->binary.op = op;
        return n;
    }

    return node;
}

/* ---- statements ---- */

AST* parse_statement();

AST* parse_block() {
    AST* block = new_node(NODE_BLOCK);
    block->block.stmts = NULL;
    block->block.count = 0;

    if (current_token.type != TOKEN_LBRACE) {
        printf("Expected {\n");
        exit(1);
    }
    advance();

    while (current_token.type != TOKEN_RBRACE) {
        block->block.stmts = realloc(block->block.stmts,
            sizeof(AST*) * (block->block.count + 1));
        block->block.stmts[block->block.count++] = parse_statement();
    }

    advance();
    return block;
}

AST* parse_statement() {
    if (current_token.type == TOKEN_LET) {
        advance();

        AST* n = new_node(NODE_ASSIGN);
        strcpy(n->assign.name, current_token.value);
        advance();

        advance(); // =

        n->assign.value = parse_expression();

        advance(); // ;
        return n;
    }

    if (current_token.type == TOKEN_PRINT) {
        advance();

        AST* n = new_node(NODE_PRINT);
        n->print.expr = parse_expression();

        advance(); // ;
        return n;
    }

    if (current_token.type == TOKEN_IF) {
        advance();

        AST* n = new_node(NODE_IF);

        n->if_stmt.cond = parse_expression();
        n->if_stmt.then_branch = parse_block();

        return n;
    }

    if (current_token.type == TOKEN_WHILE) {
        advance();

        AST* n = new_node(NODE_WHILE);

        n->while_stmt.cond = parse_expression();
        n->while_stmt.body = parse_block();

        return n;
    }

    printf("Unknown statement\n");
    exit(1);
}

/* ================= INTERPRETER ================= */

typedef struct Var {
    char name[64];
    int value;
    struct Var* next;
} Var;

Var* vars = NULL;

void set_var(const char* name, int val) {
    Var* v = vars;
    while (v) {
        if (!strcmp(v->name, name)) {
            v->value = val;
            return;
        }
        v = v->next;
    }

    Var* n = malloc(sizeof(Var));
    strcpy(n->name, name);
    n->value = val;
    n->next = vars;
    vars = n;
}

int get_var(const char* name) {
    Var* v = vars;
    while (v) {
        if (!strcmp(v->name, name))
            return v->value;
        v = v->next;
    }
    printf("Undefined: %s\n", name);
    exit(1);
}

int eval(AST* n);

int eval(AST* n) {
    switch (n->type) {
        case NODE_NUMBER: return n->number;
        case NODE_VAR: return get_var(n->name);

        case NODE_BINARY: {
            int l = eval(n->binary.left);
            int r = eval(n->binary.right);

            switch (n->binary.op) {
                case '+': return l + r;
                case '-': return l - r;
                case '*': return l * r;
                case '/': return l / r;
                case '<': return l < r;
                case '>': return l > r;
                case '=': return l == r;
            }
        }
        default: return 0;
    }
}

void exec(AST* n);

void exec(AST* n) {
    switch (n->type) {
        case NODE_ASSIGN:
            set_var(n->assign.name, eval(n->assign.value));
            break;

        case NODE_PRINT:
            printf("%d\n", eval(n->print.expr));
            break;

        case NODE_BLOCK:
            for (int i = 0; i < n->block.count; i++)
                exec(n->block.stmts[i]);
            break;

        case NODE_IF:
            if (eval(n->if_stmt.cond))
                exec(n->if_stmt.then_branch);
            break;

        case NODE_WHILE:
            while (eval(n->while_stmt.cond))
                exec(n->while_stmt.body);
            break;

        default:
            break;
    }
}

/* ================= FILE ================= */

char* read_file(const char* path) {
    FILE* f = fopen(path, "rb");
    if (!f) {
        printf("Cannot open file\n");
        exit(1);
    }

    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    rewind(f);

    char* buffer = malloc(size + 1);
    fread(buffer, 1, size, f);
    buffer[size] = '\0';

    fclose(f);
    return buffer;
}

/* ================= MAIN ================= */

int main(int argc, char** argv) {
    if (argc < 2) {
        printf("Usage: %s program.txt\n", argv[0]);
        return 1;
    }

    src = read_file(argv[1]);
    advance();

    while (current_token.type != TOKEN_EOF) {
        AST* stmt = parse_statement();
        exec(stmt);
    }

    return 0;
}

Przykładowy skrypt `test.ml`

let x = 0;

while x < 5 {
    print x;
    let x = x + 1;
}

if x == 5 {
    print 999;
}

Kompilacja i uruchomienie

gcc -o minilang minilang.c
sudo chmod +x ./minilang
./minilang test.ml

Wynik działania skryptu

Podsumowanie

Zaimplementowany interpreter to:

pełny pipeline kompilacji (bez kompilacji do kodu maszynowego)
działający język skryptowy
fundament pod własny język programowania

Najważniejsza wartość: zrozumienie, jak działają języki „od środka”.

1 kwietnia 2026 7