feat: remove stb_c_lexer

This commit is contained in:
2026-05-22 00:24:32 +02:00
parent 37f32fa280
commit 3736df7249
8 changed files with 717 additions and 1233 deletions

486
cmeta.h
View File

@@ -19,20 +19,23 @@ typedef struct {
#ifdef CMETA_COMPTIME
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#define STB_C_LEXER_IMPLEMENTATION
#include "stb_c_lexer.h"
stb_lexer lexer = {0};
#include <stdlib.h>
#include <linux/limits.h>
typedef struct {
char* type;
char* name;
const char* data;
size_t len;
} String_View;
typedef struct {
String_View type;
String_View name;
} Parsed_Field_Info;
typedef struct {
@@ -42,9 +45,8 @@ typedef struct {
} Parsed_Field_Infos;
typedef struct {
char* name;
size_t fields_count;
Parsed_Field_Info *fields;
String_View name;
Parsed_Field_Infos fields;
} Parsed_Struct_Info;
typedef struct {
@@ -94,138 +96,347 @@ void sb_append_ch(String_Builder* sb, char ch) {
(da)->items[(da)->count++] = (item); \
} while (0) \
bool lexer_expect_keyword(const char* expected) {
stb_c_lexer_get_token(&lexer);
#define SV_FMT "%.*s"
#define SV_ARG(sv) (int) (sv).len, (sv).data
#define SV_EMPTY ((String_View) {0})
if (lexer.token != CLEX_id) {
// TODO: map lexer.token to readable name
fprintf(stderr, "ERROR: expected `%s` but got `%ld`\n", expected, lexer.token);
return false;
}
String_View make_sv_from_cstr(const char* cstr) {
return (String_View) {
.data = cstr,
.len = strlen(cstr),
};
}
if (strcmp(lexer.string, expected) != 0) {
fprintf(stderr, "ERROR: expected `%s` but got `%s`\n", expected, lexer.string);
return false;
// Returns true when the first prefix.len characters of `sv` equal `prefix`.
// An empty prefix matches every view; a prefix longer than `sv` never matches.
bool sv_starts_with(String_View sv, String_View prefix) {
    if (prefix.len > sv.len) {
        return false;
    }
    // Nothing to compare for an empty prefix (also avoids touching the data pointers).
    if (prefix.len == 0) {
        return true;
    }
    return memcmp(sv.data, prefix.data, prefix.len) == 0;
}
bool lexer_expect(long expected, const char* expected_str) {
stb_c_lexer_get_token(&lexer);
// Convenience wrapper around sv_starts_with for a NUL-terminated prefix.
// `prefix` is only read, so it is taken as `const char*` (matching
// sv_eq_cstr); string literals and const strings can be passed without
// discarding qualifiers.
bool sv_starts_with_cstr(String_View sv, const char* prefix) {
    return sv_starts_with(sv, make_sv_from_cstr(prefix));
}
if (lexer.token != expected) {
// TODO: map lexer.token to readable name
if(expected_str != NULL) {
fprintf(stderr, "ERROR: expected %s but got `%ld`\n", expected_str, lexer.token);
} else {
fprintf(stderr, "ERROR: expected `%ld` but got `%ld`\n", expected, lexer.token);
}
return false;
// Returns true when the last suffix.len characters of `sv` equal `suffix`.
// An empty suffix matches every view; a suffix longer than `sv` never matches.
bool sv_ends_with(String_View sv, String_View suffix) {
    if (suffix.len > sv.len) {
        return false;
    }
    // Nothing to compare for an empty suffix (also avoids touching the data pointers).
    if (suffix.len == 0) {
        return true;
    }
    const char* tail = sv.data + (sv.len - suffix.len);
    return memcmp(tail, suffix.data, suffix.len) == 0;
}
long lexer_peek() {
char* mark = lexer.parse_point;
if (!stb_c_lexer_get_token(&lexer)) {
lexer.parse_point = mark;
return CLEX_eof;
// Convenience wrapper around sv_ends_with for a NUL-terminated suffix.
// `suffix` is only read, so it is taken as `const char*` (matching
// sv_eq_cstr); string literals and const strings can be passed without
// discarding qualifiers.
bool sv_ends_with_cstr(String_View sv, const char* suffix) {
    return sv_ends_with(sv, make_sv_from_cstr(suffix));
}
// Returns the sub-view [start, end) of `sv` without copying.
// `end` is clamped to sv.len. An empty view (SV_EMPTY) is returned when `sv`
// is empty, when the range is empty or inverted (start >= end), or when
// `start` lies at or beyond the end of `sv`.
String_View sv_sub(String_View sv, size_t start, size_t end) {
    const bool nothing_to_take = sv.len == 0 || start >= end || start >= sv.len;
    if (nothing_to_take) {
        return SV_EMPTY;
    }

    const size_t clamped_end = end < sv.len ? end : sv.len;
    String_View slice = {
        .data = sv.data + start,
        .len = clamped_end - start,
    };
    return slice;
}
String_View sv_trim_left(String_View sv) {
size_t start = 0;
while (start < sv.len && isspace(sv.data[start])) {
start += 1;
}
long token = lexer.token;
lexer.parse_point = mark;
return sv_sub(sv, start, sv.len);
}
// Returns `sv` with trailing whitespace (as classified by isspace) removed.
// The result is a view into the same data; nothing is copied.
// An empty or all-whitespace view yields an empty view.
String_View sv_trim_right(String_View sv) {
    // `end` is one past the last kept character. Counting this way avoids
    // `sv.len - 1` wrapping around for an empty view, and lets the loop strip
    // the character at index 0 as well when the whole view is whitespace.
    size_t end = sv.len;
    // isspace requires a value representable as unsigned char (or EOF).
    while (end > 0 && isspace((unsigned char) sv.data[end - 1])) {
        end -= 1;
    }
    return sv_sub(sv, 0, end);
}
// Strips whitespace from both ends of `sv`: leading first, then trailing.
String_View sv_trim(String_View sv) {
    String_View without_leading = sv_trim_left(sv);
    return sv_trim_right(without_leading);
}
// Returns a copy of the view itself (pointer and length).
// The characters it refers to are not duplicated.
String_View sv_copy(String_View sv) {
    String_View same_view = sv;
    return same_view;
}
// Splits `*sv` at the first occurrence of `delimiter`.
// Returns the part before the delimiter and advances `*sv` to the part after
// it (the delimiter itself is dropped). When the delimiter does not occur,
// the whole view is returned and `*sv` becomes empty.
String_View sv_chop_by_delim(String_View* sv, char delimiter) {
    size_t i = 0;
    while (i < sv->len && sv->data[i] != delimiter) {
        i += 1;
    }

    String_View chopped = sv_sub(*sv, 0, i);

    // The loop stops either on the delimiter (i < len) or at the end of the
    // view (i == len). Only the first case has a delimiter to skip; deciding
    // by index avoids reading sv->data[sv->len], which is outside the view.
    const size_t rest_start = (i < sv->len) ? i + 1 : i;
    *sv = sv_sub(*sv, rest_start, sv->len);

    return chopped;
}
// Removes the longest leading run of characters for which `predicate`
// returns true. Returns that run and advances `*sv` past it.
String_View sv_chop_while(String_View* sv, bool (*predicate)(char c)) {
    size_t taken;
    for (taken = 0; taken < sv->len; taken += 1) {
        if (!predicate(sv->data[taken])) {
            break;
        }
    }

    const String_View whole = *sv;
    *sv = sv_sub(whole, taken, whole.len);
    return sv_sub(whole, 0, taken);
}
// Removes the first `by` characters from `*sv` and returns them as a view.
// An already-empty view is left untouched and an empty view is returned.
// `by` is passed on to sv_sub, whose bounds are size_t, so requests past the
// end of the view are clamped there.
String_View sv_shift(String_View* sv, int by) {
    if (sv->len == 0) {
        return SV_EMPTY;
    }

    const String_View before = *sv;
    *sv = sv_sub(before, by, before.len);
    return sv_sub(before, 0, by);
}
// Returns true when both views have the same length and the same characters.
bool sv_eq(String_View a, String_View b) {
    if (a.len != b.len) {
        return false;
    }
    // Two empty views are equal; skip the comparison so the data pointers are not touched.
    return a.len == 0 || memcmp(a.data, b.data, a.len) == 0;
}
// Compares a view against a NUL-terminated C string (terminator excluded).
bool sv_eq_cstr(String_View a, const char* b) {
    const String_View other = make_sv_from_cstr(b);
    return sv_eq(a, other);
}
// Debug helper: prints the view's contents (quoted) and its length to stdout,
// one per line.
void sv_dump(String_View sv) {
    printf("data = \"" SV_FMT "\"\n" "len = %zu\n", SV_ARG(sv), sv.len);
}
// Copies the view into a freshly malloc'ed, NUL-terminated C string.
// Returns NULL when the allocation fails. The caller owns the returned buffer
// and must free() it.
char* sv_to_string(String_View sv) {
    char* text = (char*) malloc(sv.len + 1);
    if (text == NULL) {
        return NULL;
    }
    // An empty view may carry a NULL data pointer (SV_EMPTY); memcpy must not
    // be handed a NULL source even for zero bytes.
    if (sv.len > 0) {
        memcpy(text, sv.data, sv.len);
    }
    text[sv.len] = '\0';
    return text;
}
// Kinds of tokens the lexer distinguishes. token_kind_to_str maps each kind
// to the text used in error messages.
typedef enum {
// identifier; keywords such as `typedef` and `struct` are lexed as identifiers too
TOKEN_IDENT,
// `"`
TOKEN_DQUOTE,
// `(`
TOKEN_OPAREN,
// `)`
TOKEN_CPAREN,
// `{`
TOKEN_OCURLY,
// `}`
TOKEN_CCURLY,
// `;`
TOKEN_SEMI,
// `*`
TOKEN_STAR,
// NOTE(review): not produced by lexer_next in the code visible here — confirm intended use
TOKEN_IGNORED,
// produced once no input is left
TOKEN_EOF,
// number of kinds above; token_kind_to_str static_asserts on this value
__token_kind_count,
} Token_Kind;
// Returns a statically allocated, human-readable spelling of `token` for use
// in diagnostics. A value outside Token_Kind trips an assert in debug builds
// and yields "unknown token" when asserts are compiled out.
const char* token_kind_to_str(Token_Kind token) {
    // Fails the build when a kind is added to Token_Kind without revisiting the table below.
    static_assert(__token_kind_count == 10, "Update the token_kind_to_str table");

    switch (token) {
        case TOKEN_IDENT: return "identifier";
        case TOKEN_DQUOTE: return "\"";
        case TOKEN_OPAREN: return "(";
        case TOKEN_CPAREN: return ")";
        case TOKEN_OCURLY: return "{";
        case TOKEN_CCURLY: return "}";
        case TOKEN_SEMI: return ";";
        case TOKEN_STAR: return "*";
        case TOKEN_IGNORED: return "ignored";
        case TOKEN_EOF: return "EOF";
        default: break;
    }

    assert(false && "Unreachable");
    // With NDEBUG the assert above disappears; without this return the
    // function would fall off its end and callers would format a garbage pointer.
    return "unknown token";
}
// A single lexed token: its kind plus the slice of the input it covers.
typedef struct {
Token_Kind kind;
// view into the lexer's text (built with sv_sub in lexer_make_token); not a separate copy
String_View text;
} Token;
// Lexer state: the part of the input that has not been consumed yet.
typedef struct {
// shrinks from the front as lexer_next / lexer_make_token consume characters
String_View text;
} Lexer;
// Creates a lexer positioned at the start of `text`.
// The lexer only keeps the view; the text itself is not copied.
Lexer make_lexer(String_View text) {
    Lexer lexer;
    lexer.text = text;
    return lexer;
}
// Tells whether the character at index `i` of `sv` may appear at that
// position of an identifier:
//   - any position: ASCII letters and '_'
//   - first position only (i == 0): '$'
//   - later positions only (i > 0): ASCII digits
// `i` must be inside the view; this is enforced with an assert.
bool is_valid_ident_char_at(String_View sv, size_t i) {
    assert(i < sv.len && "Accessing char outside of sv");

    const char c = sv.data[i];
    const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    if (is_letter || c == '_') {
        return true;
    }

    if (i == 0) {
        return c == '$';
    }
    return c >= '0' && c <= '9';
}
// Produces a token of `kind` covering the first `text_end` characters of the
// remaining input, then advances the lexer past those characters.
Token lexer_make_token(Lexer* lexer, Token_Kind kind, size_t text_end) {
    Token made;
    made.kind = kind;
    made.text = sv_sub(lexer->text, 0, text_end);

    // Consume what the token covers; the shifted-off view is not needed here.
    sv_shift(&lexer->text, text_end);
    return made;
}
// Reads the next token from `lexer` into `*token` and advances the lexer.
//
// Leading whitespace is skipped. Punctuation (`"`, `*`, `(`, `)`, `{`, `}`,
// `;`) yields single-character tokens, a run of identifier characters yields
// TOKEN_IDENT, and exhausted input yields TOKEN_EOF. Any other character is
// silently skipped. Always returns true in the current implementation.
bool lexer_next(Lexer* lexer, Token* token) {
    // Loop instead of recursing once per skipped character, so long runs of
    // unrecognised input cannot grow the call stack.
    for (;;) {
        lexer->text = sv_trim_left(lexer->text);

        if (lexer->text.len == 0) {
            *token = lexer_make_token(lexer, TOKEN_EOF, 0);
            return true;
        }

        Token_Kind kind;
        switch (lexer->text.data[0]) {
            // TODO: while in string, keep calling next
            case '"': kind = TOKEN_DQUOTE; break;
            case '*': kind = TOKEN_STAR; break;
            case '(': kind = TOKEN_OPAREN; break;
            case ')': kind = TOKEN_CPAREN; break;
            case '{': kind = TOKEN_OCURLY; break;
            case '}': kind = TOKEN_CCURLY; break;
            case ';': kind = TOKEN_SEMI; break;
            default: {
                if (is_valid_ident_char_at(lexer->text, 0)) {
                    // Index 0 is already known to be valid. Stop at the end
                    // of the text: an identifier that is the last thing in
                    // the input must not be scanned past the view's bounds.
                    size_t end = 1;
                    while (end < lexer->text.len && is_valid_ident_char_at(lexer->text, end)) {
                        end += 1;
                    }
                    *token = lexer_make_token(lexer, TOKEN_IDENT, end);
                    return true;
                }

                // Not something this lexer understands: drop it and try again.
                sv_shift(&lexer->text, 1);
                continue;
            }
        }

        *token = lexer_make_token(lexer, kind, 1);
        return true;
    }
}
// Reads the next token without consuming it.
// `lexer` is received by value, so advancing it here leaves the caller's
// lexer where it was.
bool lexer_peek(Lexer lexer, Token* token) {
    Lexer* scratch = &lexer;
    return lexer_next(scratch, token);
}
// Peeks at the next token and checks that it has kind `token_kind`.
// When `token` is non-NULL it receives the peeked token, even on a mismatch.
// On a mismatch an error is printed to stderr and false is returned.
// The caller's lexer is not advanced (`lexer` is passed by value).
bool lexer_peek_expect(Lexer lexer, Token* token, Token_Kind token_kind) {
    Token peeked;
    if (!lexer_peek(lexer, &peeked)) {
        return false;
    }

    if (token != NULL) {
        *token = peeked;
    }

    if (peeked.kind == token_kind) {
        return true;
    }

    fprintf(stderr, "ERROR: Expected `%s` but got `%s`\n", token_kind_to_str(token_kind), token_kind_to_str(peeked.kind));
    return false;
}
// Consumes the next token and checks that it has kind `token_kind`.
// When `token` is non-NULL it receives the consumed token, even on a mismatch.
// On a mismatch an error is printed to stderr and false is returned; the
// token has been consumed either way.
bool lexer_next_expect(Lexer* lexer, Token* token, Token_Kind token_kind) {
    Token consumed;
    if (!lexer_next(lexer, &consumed)) {
        return false;
    }

    if (token != NULL) {
        *token = consumed;
    }

    if (consumed.kind == token_kind) {
        return true;
    }

    fprintf(stderr, "ERROR: Expected `%s` but got `%s`\n", token_kind_to_str(token_kind), token_kind_to_str(consumed.kind));
    return false;
}
// parses typedef struct { FIELDS } TYPE_NAME
bool parse_struct(Parsed_Struct_Info* info) {
bool result = false;
char* name = NULL;
bool parse_struct(Lexer* lexer, Parsed_Struct_Info* info) {
Token token;
Parsed_Field_Infos fields = {0};
String_Builder field = {0};
if (!lexer_expect_keyword("typedef")) goto fail;
if (!lexer_expect_keyword("struct")) goto fail;
if (!lexer_expect('{', NULL)) goto fail;
if (!lexer_next_expect(lexer, &token, TOKEN_IDENT)) return false;
if (!sv_eq_cstr(token.text, "struct")) {
fprintf(stderr, "ERROR: Expected `struct` but got `" SV_FMT "`\n", SV_ARG(token.text));
return false;
}
if (!lexer_next_expect(lexer, NULL, TOKEN_OCURLY)) return false;
while (true) {
char* mark = lexer.parse_point;
if (!stb_c_lexer_get_token(&lexer)) {
fprintf(stderr, "ERROR: expected struct fields but got EOF\n");
goto fail;
}
if (!lexer_peek(*lexer, &token)) return false;
if (token.kind == TOKEN_CCURLY) break;
if (lexer.token == '}') break;
lexer.parse_point = mark;
// TODO: keep peeking until we reach semi
field.length = 0;
while (stb_c_lexer_get_token(&lexer) && lexer.token != ';') {
if (lexer.token <= 255) {
// TODO: parse arrays
if(lexer.token == '[') goto fail;
String_View field_type_sv;
String_View field_name_sv;
const char* field_type_begin = token.text.data;
const char* field_type_end = NULL;
sb_append_ch(&field, (char)lexer.token);
sb_append_ch(&field, ' ');
} else {
// TODO: parse unions
if(strcmp(lexer.string, "union") == 0) goto fail;
while (true) {
if (!lexer_next(lexer, &token)) return false;
// TODO: parse attributes
sb_append(&field, lexer.string);
sb_append_ch(&field, ' ');
if (token.kind == TOKEN_IDENT) {
Token next_token;
if (!lexer_peek(*lexer, &next_token)) return false;
if (next_token.kind == TOKEN_SEMI) {
field_type_end = token.text.data;
field_name_sv = token.text;
break;
}
}
}
field.data[field.length - 1] = '\0';
char* last_space = strrchr(field.data, ' ');
char* field_name = strdup(last_space + 1);
field_type_sv = (String_View) {
.data = field_type_begin,
.len = field_type_end - field_type_begin,
};
field.data[last_space - field.data] = '\0';
Parsed_Field_Info field = {
.type = field_type_sv,
.name = field_name_sv,
};
da_append(&fields, field);
char* field_type = strdup(field.data);
da_append(&fields, ((Parsed_Field_Info) {
.type = field_type,
.name = field_name,
}));
if (!lexer_next_expect(lexer, NULL, TOKEN_SEMI)) return false;
}
if (!lexer_expect(CLEX_id, "type name")) goto fail;
name = strdup(lexer.string);
if (!lexer_next_expect(lexer, NULL, TOKEN_CCURLY)) return false;
info->name = name;
info->fields_count = fields.count;
info->fields = (Parsed_Field_Info*)calloc(info->fields_count, sizeof(Parsed_Field_Info));
for(size_t i = 0; i < info->fields_count; i += 1) {
info->fields[i].type = fields.items[i].type;
info->fields[i].name = fields.items[i].name;
Token type_name_token;
if (!lexer_next_expect(lexer, &type_name_token, TOKEN_IDENT)) return false;
if (!lexer_next_expect(lexer, NULL, TOKEN_SEMI)) return false;
*info = (Parsed_Struct_Info) {
.name = type_name_token.text,
.fields = fields,
};
return true;
}
// TODO: it should accept Type_Info instead
bool parse_typedef(Lexer* lexer, Parsed_Struct_Info* info) {
Token token;
if (!lexer_peek(*lexer, &token)) return false;
if (token.kind == TOKEN_IDENT && sv_eq_cstr(token.text, "struct")) {
return parse_struct(lexer, info);
}
result = true;
fail:
free(field.data);
if(!result) {
free(name);
for(size_t i = 0; i < fields.count; i += 1) {
free(fields.items[i].name);
free(fields.items[i].type);
}
free(fields.items);
}
return result;
fprintf(stderr, "ERROR: Only parsing of `typedef struct {...} T` is implemented for now, got `" SV_FMT "`\n", SV_ARG(token.text));
return false;
}
void print_struct(Parsed_Struct_Info info) {
printf("struct_name = %s\n", info.name);
printf("fields[%zu] = [\n", info.fields_count);
for (size_t i = 0; i < info.fields_count; i += 1) {
printf(" { type = %s, name = %s },\n", info.fields[i].type, info.fields[i].name);
printf("struct_name = " SV_FMT "\n", SV_ARG(info.name));
printf("fields[%zu] = [\n", info.fields.count);
for (size_t i = 0; i < info.fields.count; i += 1) {
printf(" { type = " SV_FMT ", name = " SV_FMT " },\n", SV_ARG(info.fields.items[i].type), SV_ARG(info.fields.items[i].name));
}
printf("]\n");
}
@@ -244,14 +455,15 @@ char* to_lowercase(char* str) {
}
void generate_struct_info(FILE* stream, Parsed_Struct_Info info) {
char* lowercase_name = to_lowercase(strdup(info.name));
char* text = sv_to_string(info.name);
char* lowercase_name = to_lowercase(text);
gen("static Struct_Info %s_info = {", lowercase_name);
gen(" .name = \"%s\",", info.name);
gen(" .fields_count = %zu,", info.fields_count);
gen(" .fields = (Field_Info[%zu]) {", info.fields_count);
for (size_t i = 0; i < info.fields_count; i += 1) {
gen(" { .type = \"%s\", .name = \"%s\" },", info.fields[i].type, info.fields[i].name);
gen("Struct_Info %s_info = {", lowercase_name);
gen(" .name = \"" SV_FMT "\",", SV_ARG(info.name));
gen(" .fields_count = %zu,", info.fields.count);
gen(" .fields = (Field_Info[%zu]) {", info.fields.count);
for (size_t i = 0; i < info.fields.count; i += 1) {
gen(" { .type = \"" SV_FMT "\", .name = \"" SV_FMT "\" },", SV_ARG(info.fields.items[i].type), SV_ARG(info.fields.items[i].name));
}
gen(" },");
gen("};");
@@ -262,11 +474,12 @@ void generate_struct_info(FILE* stream, Parsed_Struct_Info info) {
bool read_entire_file(const char* file_path, char** content) {
bool result = false;
FILE* file = fopen(file_path, "rb");
long length = 0;
if(file == NULL) goto fail;
if(fseek(file, 0, SEEK_END) < 0) goto fail;
long length = ftell(file);
length = ftell(file);
if(length < 0) goto fail;
if(fseek(file, 0, SEEK_SET) < 0) goto fail;
@@ -294,30 +507,34 @@ bool generate_output_file(const char* output_path, Parsed_Struct_Infos struct_in
const size_t GENERATION_MARK_LEN = strlen(GENERATION_MARK);
bool result = false;
FILE* output_file = NULL;
FILE* stream = NULL;
char* generate_begin = NULL;
char* generate_end = NULL;
char* header_content = NULL;
char* header_content;
if (!read_entire_file(__FILE__, &header_content)) goto fail;
// 1. find BEGIN and END
char* generate_begin = strstr(header_content, GENERATION_MARK);
generate_begin = strstr(header_content, GENERATION_MARK);
if (generate_begin == NULL) {
fprintf(stderr, "ERROR: could not found generation mark in cmeta.h\n");
goto fail;
}
char* generate_end = strstr(generate_begin + GENERATION_MARK_LEN, GENERATION_MARK);
generate_end = strstr(generate_begin + GENERATION_MARK_LEN, GENERATION_MARK);
if (generate_end == NULL) {
fprintf(stderr, "ERROR: could not found generation mark in cmeta.h\n");
goto fail;
}
FILE* output_file = fopen(output_path, "wb");
output_file = fopen(output_path, "wb");
if (!output_file) {
fprintf(stderr, "ERROR: could not write to %s: %s\n", output_path, strerror(errno));
goto fail;
}
FILE* stream = output_file;
stream = output_file;
// write up to the generation mark, including it
fwrite(header_content, generate_begin + GENERATION_MARK_LEN - header_content, 1, stream);
@@ -337,7 +554,7 @@ fail:
}
bool preprocess_file(const char* file_path, String_Builder* result) {
char command[256] = {0};
char command[PATH_MAX + 16] = {0};
sprintf(command, "gcc -E %s", file_path);
FILE* fp = popen(command, "r");
@@ -346,29 +563,25 @@ bool preprocess_file(const char* file_path, String_Builder* result) {
return false;
}
char line[512];
char line[PATH_MAX + 64];
size_t line_num = 0;
char file_name[512];
char file_name[PATH_MAX];
size_t cursor = 0;
result->length = 0;
// NOTE: it currently only gets the code of the file, without including
// headers because it's easier to parse for now
bool collecting_content = false;
while (fgets(line, sizeof(line), fp) != NULL) {
if (sscanf(line, "# %zu \"%s\"", &line_num, file_name) == 2) {
if (sscanf(line, "# %zu \"%4095s\"", &line_num, file_name) == 2) {
// remove trailing "
file_name[strlen(file_name) - 1] = '\0';
if (strcmp(file_name, file_path) == 0) {
collecting_content = true;
continue;
}
collecting_content = strcmp(file_name, file_path) == 0;
// TODO: read original file at line_num, to check for comments (e.g annotations)
} else if(collecting_content) {
sb_append(result, line);
cursor += strlen(line);
}
}
@@ -382,26 +595,21 @@ bool process_file(const char* input_file) {
String_Builder input_content = {0};
if (!preprocess_file(input_file, &input_content)) return false;
// init lexer
char string_store[1024] = {0};
stb_c_lexer_init(&lexer, input_content.data, input_content.data + input_content.length, string_store, sizeof(string_store) / sizeof(char));
// find and parse all structs
Parsed_Struct_Infos struct_infos = {0};
String_View sv = make_sv_from_cstr(input_content.data);
Lexer lexer = make_lexer(sv);
while (true) {
char* mark = lexer.parse_point;
if (!stb_c_lexer_get_token(&lexer)) break;
if (lexer.token == CLEX_id && strcmp(lexer.string, "typedef") == 0) {
lexer.parse_point = mark;
Token token;
do {
if (!lexer_next(&lexer, &token)) return false;
if (token.kind == TOKEN_IDENT && sv_eq_cstr(token.text, "typedef")) {
Parsed_Struct_Info struct_info = {0};
if (parse_struct(&struct_info)) {
da_append(&struct_infos, struct_info);
}
if (!parse_typedef(&lexer, &struct_info)) return false;
da_append(&struct_infos, struct_info);
}
}
while (token.kind != TOKEN_EOF);
if (!generate_output_file(__FILE__, struct_infos)) return false;