diff --git a/.clang-format b/.clang-format
index c93cf11..089cc53 100644
--- a/.clang-format
+++ b/.clang-format
@@ -116,6 +116,9 @@ IfMacros:
   - KJ_IF_MAYBE
 IncludeBlocks: Regroup
 IncludeCategories:
+  - Regex: '^"jay/'
+    Priority: 1
+    CaseSensitive: true
   - Regex: '^"jary/'
     Priority: 2
     CaseSensitive: true
diff --git a/include/jary/memory.h b/include/jary/memory.h
index 89bee42..a753b7b 100644
--- a/include/jary/memory.h
+++ b/include/jary/memory.h
@@ -111,4 +111,23 @@ int sc_strfmt(struct sc_mem *alloc, char **str, const char *fmt, ...);
 int sc_reap(struct sc_mem *alloc, void *buf, free_t expire);
 void sc_free(struct sc_mem *alloc);
 
+// Allocate a struct sb_mem from `alloc` and hand it to sc_reap together with
+// sb_free. Returns the buffer, or 0 when sc_alloc yields a null pointer or
+// sc_reap reports failure (non-zero).
+// NOTE(review): the struct is returned exactly as sc_alloc produced it --
+// confirm that sc_alloc zeroes memory or that sb_mem needs no initialisation.
+static inline struct sb_mem *sc_linbuf(struct sc_mem *alloc)
+{
+	void *ptr = sc_alloc(alloc, sizeof(struct sb_mem));
+
+	// never register a null buffer with sb_free as its reaper
+	if (!ptr)
+		return 0;
+
+	if (sc_reap(alloc, ptr, (free_t) sb_free))
+		return 0;
+
+	return (struct sb_mem *) ptr;
+}
+
 #endif // JAYVM_MEM_H
diff --git a/lib/jay/CMakeLists.txt b/lib/jay/CMakeLists.txt
index 1e6a8ef..35e0c28 100644
--- a/lib/jay/CMakeLists.txt
+++ b/lib/jay/CMakeLists.txt
@@ -51,22 +51,22 @@ if( CMAKE_C_COMPILER_ID MATCHES "^(Clang|GNU)$" )
 endif()
 
 target_include_directories( scanner
-  PUBLIC
+  PRIVATE
     ${CMAKE_SOURCE_DIR}/include
     ${CMAKE_SOURCE_DIR}/lib/jay
 )
 target_include_directories( parser
-  PUBLIC
+  PRIVATE
     ${CMAKE_SOURCE_DIR}/include
     ${CMAKE_SOURCE_DIR}/lib/jay
 )
 target_include_directories( compiler
-  PUBLIC
+  PRIVATE
     ${CMAKE_SOURCE_DIR}/include
     ${CMAKE_SOURCE_DIR}/lib/jay
 )
 target_include_directories( exec
-  PUBLIC
+  PRIVATE
     ${CMAKE_SOURCE_DIR}/include
     ${CMAKE_SOURCE_DIR}/lib/jay
 )
diff --git a/lib/jay/compiler.c b/lib/jay/compiler.c
index f141003..1e3556e 100644
--- a/lib/jay/compiler.c
+++ b/lib/jay/compiler.c
@@ -1716,7 +1716,6 @@ static inline bool _rule_decl(const struct jy_asts *asts,
 	// patch jumps to END
 	for (uint32_t i = 0; i < patchsz; ++i) {
 		uint32_t ofs = patchofs[i];
-		// TODO: Handle long jump scenario
 		short jmp = (short) (*ctx.codesz - ofs + 1);
 
 		memcpy(*ctx.codes + ofs, &jmp, sizeof(jmp));
diff --git 
a/lib/jay/parser.c b/lib/jay/parser.c
index 822bd8d..4c4a059 100644
--- a/lib/jay/parser.c
+++ b/lib/jay/parser.c
@@ -1444,15 +1444,15 @@ static bool _precedence(struct parser *p,
 		goto PANIC;
 	}
 
-	enum jy_tkn inftype  = tkns->types[p->tkn];
-	enum prec   nextprec = tkn_prec(inftype);
+	enum jy_tkn inftype = tkns->types[p->tkn];
+	enum prec   lbp     = tkn_prec(inftype);
 
-	while (rbp < nextprec) {
+	while (rbp < lbp) {
 		if (_infix(inftype, p, asts, tkns, errs, root))
 			goto PANIC;
 
-		inftype  = tkns->types[p->tkn];
-		nextprec = tkn_prec(inftype);
+		inftype = tkns->types[p->tkn];
+		lbp     = tkn_prec(inftype);
 	}
 
 	return ended(tkns->types, tkns->size);
diff --git a/lib/jay/regex.c b/lib/jay/regex.c
new file mode 100644
index 0000000..a9415ab
--- /dev/null
+++ b/lib/jay/regex.c
@@ -0,0 +1,734 @@
+#include "regex.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define AST_MAX UINT16_MAX
+
+enum {
+	RGX_NEXT = 1,
+	RGX_ENDED = 2,
+	RGX_OOM = 3,
+	RGX_INVARIANT = 5,
+};
+
+enum TKN {
+	TKN_LEFT_BRACKET,
+	TKN_RIGHT_BRACKET,
+	TKN_LEFT_PAREN,
+	TKN_RIGHT_PAREN,
+	TKN_LEFT_BRACE,
+	TKN_RIGHT_BRACE,
+
+	TKN_DOT,
+	TKN_CARET,
+	TKN_QMARK,
+	TKN_VERTBAR,
+	TKN_BACKSLASH,
+
+	TKN_PLUS,
+	TKN_STAR,
+	TKN_COMMA,
+	TKN_DOLLAR,
+
+	TKN_ESCAPED,
+
+	TKN_SINGLE,
+
+	TKN_EOF,
+};
+
+enum OPCODE {
+	OP_CHAR,
+	OP_SPLIT16,
+	OP_SPLIT8,
+	OP_JMP16,
+};
+
+enum PREC {
+	PREC_NONE,
+	PREC_ALTERNATION,
+	PREC_CONCAT,
+	PREC_REPETITION,
+	PREC_LAST,
+};
+
+struct cmplr {
+	struct sb_mem *codebuf;
+	uint8_t *codes;
+	uint32_t codesz;
+	// last opcode index
+	uint32_t lastop;
+};
+
+struct parser {
+	const char *base;
+	const char *regex;
+	const char *lex;
+	char *errmsg;
+	enum TKN type;
+	uint32_t lexsz;
+	// current ast id
+	rgxast_t ast;
+};
+
+union codeview {
+	uint8_t *u8;
+	int8_t *i8;
+	uint16_t *u16;
+	int16_t *i16;
+	int32_t *i32;
+};
+
+// Append a node of `type` carrying character `c` to the parallel arrays in
+// `list`. When `ast` is non-NULL it receives the id of the new node.
+// Returns RGX_OK, or RGX_OOM when the list is full or a push fails.
+static int mkast(enum RGXAST type, char c, struct rgxast *list, rgxast_t *ast)
+{
+	assert(list->size + 1 < AST_MAX);
+
+	int ret = RGX_OK;
+
+	if (list->size + 1 >= AST_MAX)
+		goto OUT_OF_MEMORY;
+
+	jry_mem_push(list->type, list->size, type);
+
+	if (list->type == NULL)
+		goto OUT_OF_MEMORY;
+
+	jry_mem_push(list->c, list->size, c);
+
+	if (list->c == NULL)
+		goto OUT_OF_MEMORY;
+
+	jry_mem_push(list->child, list->size, NULL);
+
+	if (list->child == NULL)
+		goto OUT_OF_MEMORY;
+
+	jry_mem_push(list->childsz, list->size, 0);
+
+	if (list->childsz == NULL)
+		goto OUT_OF_MEMORY;
+
+	if (ast)
+		*ast = list->size;
+
+	list->size += 1;
+
+	goto FINISH;
+
+OUT_OF_MEMORY:
+	ret = RGX_OOM;
+
+FINISH:
+	return ret;
+}
+
+// Append `child` to the child list of node `ast`.
+// Returns RGX_OK, or RGX_OOM when the push fails.
+static int addchild(rgxast_t ast, rgxast_t child, struct rgxast *list)
+{
+	assert(ast < list->size);
+	assert(child < list->size);
+
+	rgxast_t *chsz = &list->childsz[ast];
+	rgxast_t **children = &list->child[ast];
+
+	jry_mem_push(*children, *chsz, child);
+
+	if (*children == NULL)
+		goto OUT_OF_MEMORY;
+
+	*chsz += 1;
+
+	return RGX_OK;
+
+OUT_OF_MEMORY:
+	return RGX_OOM;
+}
+
+static inline int expr(enum PREC rbp,
+		       struct sc_mem *alloc,
+		       struct parser *p,
+		       struct rgxast *asts,
+		       rgxast_t *root);
+
+// Classify the single character at `start` into *type and return a pointer
+// just past it. At the terminating NUL, *type is TKN_EOF and `start` itself
+// is returned, so the scan never advances past the end of the string.
+const char *tokenize(const char *start, enum TKN *type)
+{
+	const char *current = start;
+
+	if (current[0] == '\0') {
+		*type = TKN_EOF;
+		goto FINISH;
+	}
+
+	char c = *(current++);
+
+	switch (c) {
+	case '(':
+		*type = TKN_LEFT_PAREN;
+		goto FINISH;
+	case ')':
+		*type = TKN_RIGHT_PAREN;
+		goto FINISH;
+	case '{':
+		*type = TKN_LEFT_BRACE;
+		goto FINISH;
+	case '}':
+		*type = TKN_RIGHT_BRACE;
+		goto FINISH;
+	case '+':
+		*type = TKN_PLUS;
+		goto FINISH;
+	case '*':
+		*type = TKN_STAR;
+		goto FINISH;
+	case '|':
+		*type = TKN_VERTBAR;
+		goto FINISH;
+	case '\\':
+		*type = TKN_ESCAPED;
+		goto FINISH;
+	case '^':
+		*type = TKN_CARET;
+		goto FINISH;
+	case '?':
+		*type = TKN_QMARK;
+		goto FINISH;
+	case '.':
+		*type = TKN_DOT;
+		goto FINISH;
+	case '[':
+		*type = TKN_LEFT_BRACKET;
+		goto FINISH;
+	case ']':
+		*type = TKN_RIGHT_BRACKET;
+		goto FINISH;
+	case ',':
+		*type = TKN_COMMA;
+		goto FINISH;
+	case '$':
+		*type = TKN_DOLLAR;
+		goto FINISH;
+	case '\0':
+		*type = TKN_EOF;
+		goto FINISH;
+	}
+
+	*type = TKN_SINGLE;
+
+FINISH:
+	return current;
+}
+
+// Binding power of `type` when it appears in infix position.
+static enum PREC tknprec(enum TKN type)
+{
+	switch (type) {
+	case TKN_LEFT_PAREN:
+	case TKN_LEFT_BRACKET:
+		return PREC_LAST;
+	case TKN_PLUS:
+	case TKN_STAR:
+	case TKN_QMARK:
+		return PREC_REPETITION;
+	case TKN_VERTBAR:
+		return PREC_ALTERNATION;
+	case TKN_ESCAPED:
+	case TKN_SINGLE:
+		return PREC_CONCAT;
+	default:
+		return PREC_NONE;
+	}
+}
+
+// Advance *regex by one token. *type receives the token kind; `lexsz` and
+// `lex`, when non-NULL, receive the lexeme length and start.
+// Returns RGX_ENDED at end of input, RGX_OK otherwise.
+static int next(const char **regex,
+		enum TKN *type,
+		uint32_t *lexsz,
+		const char **lex)
+{
+	const char *start = *regex;
+	const char *end = 0;
+
+	end = tokenize(start, type);
+
+	*regex = end;
+
+	if (lexsz)
+		*lexsz = end - start;
+
+	if (lex)
+		*lex = start;
+
+	return *type == TKN_EOF ? RGX_ENDED : RGX_OK;
+}
+
+// Append OP_CHAR and its one-byte operand `c` to the code buffer.
+// Returns RGX_OK, or RGX_OOM when the buffer cannot grow.
+static int char_emit(struct cmplr *p, char c)
+{
+	if (p->codesz >= UINT32_MAX - 2)
+		return RGX_OOM;
+
+	p->codes = sb_add(p->codebuf, 0, 2);
+
+	if (p->codes == NULL)
+		goto OUT_OF_MEMORY;
+
+	p->codes[p->codesz] = OP_CHAR;
+	p->codes[p->codesz + 1] = c;
+	p->lastop = p->codesz;
+	p->codesz += 2;
+
+	return RGX_OK;
+
+OUT_OF_MEMORY:
+	return RGX_OOM;
+}
+
+// Append OP_JMP16 plus two operand bytes that are left unwritten here;
+// *loc receives the offset of the operand. Returns RGX_OK or RGX_OOM.
+static int jmp16_emit(struct cmplr *p, size_t *loc)
+{
+	if (p->codesz >= UINT32_MAX - 3)
+		goto OUT_OF_MEMORY;
+
+	p->codes = sb_add(p->codebuf, 0, 3);
+
+	if (p->codes == NULL)
+		goto OUT_OF_MEMORY;
+
+	p->codes[p->codesz] = OP_JMP16;
+	p->lastop = p->codesz;
+	*loc = p->codesz + 1;
+	p->codesz += 3;
+
+	return RGX_OK;
+
+OUT_OF_MEMORY:
+	return RGX_OOM;
+}
+
+// Append OP_SPLIT8 plus two one-byte operands that are left unwritten here;
+// *left and *right receive their offsets. Returns RGX_OK or RGX_OOM.
+static int split8_emit(struct cmplr *p, size_t *left, size_t *right)
+{
+	if (p->codesz >= UINT32_MAX - 3)
+		goto OUT_OF_MEMORY;
+
+	p->codes = sb_add(p->codebuf, 0, 3);
+
+	if (p->codes == NULL)
+		goto OUT_OF_MEMORY;
+
+	p->codes[p->codesz] = OP_SPLIT8;
+	p->lastop = p->codesz;
+	*left = p->codesz + 1;
+	*right = p->codesz + 2;
+	p->codesz += 3;
+
+	return RGX_OK;
+
+OUT_OF_MEMORY:
+	return RGX_OOM;
+}
+
+// Append OP_SPLIT16 plus two two-byte operands that are left unwritten here;
+// *left and *right receive their offsets. Returns RGX_OK or RGX_OOM.
+static int split16_emit(struct cmplr *p, size_t *left, size_t *right)
+{
+	if (p->codesz >= UINT32_MAX - 5)
+		goto OUT_OF_MEMORY;
+
+	p->codes = sb_add(p->codebuf, 0, 5);
+
+	if (p->codes == NULL)
+		goto OUT_OF_MEMORY;
+
+	p->codes[p->codesz] = OP_SPLIT16;
+	p->lastop = p->codesz;
+	*left = p->codesz + 1;
+	*right = p->codesz + 3;
+	p->codesz += 5;
+
+	return RGX_OK;
+
+OUT_OF_MEMORY:
+	return RGX_OOM;
+}
+
+// Parse the body of a bracket expression. On entry p->regex points just past
+// '['; every character up to the closing ']' becomes a child of a new
+// RGXAST_CHARSET node stored in *root ("\x" becomes an RGXAST_ESCAPE child).
+// Returns RGX_NEXT, or RGX_INVARIANT with p->errmsg set.
+static int charset(rgxast_t *root,
+		   struct sc_mem *alloc,
+		   struct parser *p,
+		   struct rgxast *asts)
+{
+	if (mkast(RGXAST_CHARSET, 0, asts, root) == RGX_OOM)
+		goto OUT_OF_MEMORY;
+
+	const char *current = p->regex;
+
+	while (*current != '\0' && *current != ']') {
+		char c = *current;
+		enum RGXAST type = RGXAST_SINGLE;
+
+		if (c == '\\') {
+			current += 1;
+
+			// a trailing '\' has nothing to escape; stop so the
+			// scan never steps past the terminator
+			if (*current == '\0')
+				break;
+
+			type = RGXAST_ESCAPE;
+			c = *current;
+		}
+
+		rgxast_t child;
+
+		if (mkast(type, c, asts, &child) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		if (addchild(*root, child, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		current += 1;
+	}
+
+	p->regex = current;
+	// consume ']'
+	next(&p->regex, &p->type, &p->lexsz, &p->lex);
+
+	if (p->type != TKN_RIGHT_BRACKET) {
+		unsigned long ofs = (unsigned long) (current - p->base);
+		sc_strfmt(alloc, &p->errmsg, "col:%lu missing ]", ofs);
+
+		if (p->errmsg == NULL)
+			goto OUT_OF_MEMORY;
+
+		goto INVARIANT;
+	}
+
+	return RGX_NEXT;
+
+OUT_OF_MEMORY:
+	p->errmsg = "oom";
+INVARIANT:
+	return RGX_INVARIANT;
+}
+
+// Parse one operand (group, charset, escape or literal) at the current token
+// into *root. Returns RGX_NEXT on success, RGX_ENDED at end of input, or
+// RGX_INVARIANT on error.
+static inline int prefix(rgxast_t *root,
+			 struct sc_mem *alloc,
+			 struct parser *p,
+			 struct rgxast *asts)
+{
+	switch (p->type) {
+	case TKN_EOF:
+		goto ENDED;
+	case TKN_LEFT_PAREN: {
+		next(&p->regex, &p->type, &p->lexsz, &p->lex);
+
+		if (expr(PREC_ALTERNATION, alloc, p, asts, root) == RGX_INVARIANT)
+			goto INVARIANT;
+
+		if (p->type != TKN_RIGHT_PAREN) {
+			unsigned long ofs = (unsigned long) (p->regex - p->base);
+			const char fmt[] = "col:%lu missing )";
+			sc_strfmt(alloc, &p->errmsg, fmt, ofs);
+
+			if (p->errmsg == NULL)
+				goto OUT_OF_MEMORY;
+
+			goto INVARIANT;
+		}
+
+		break;
+	}
+	case TKN_LEFT_BRACKET:
+		if (charset(root, alloc, p, asts) == RGX_INVARIANT)
+			goto INVARIANT;
+
+		break;
+	case TKN_ESCAPED:
+		// consume '\'
+		next(&p->regex, &p->type, &p->lexsz, &p->lex);
+
+		// the pattern ended right after '\': nothing to escape
+		if (p->type == TKN_EOF) {
+			p->errmsg = "trailing backslash";
+			goto INVARIANT;
+		}
+
+		if (mkast(RGXAST_ESCAPE, *p->lex, asts, root) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		break;
+	case TKN_SINGLE:
+		if (mkast(RGXAST_SINGLE, *p->lex, asts, root) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		break;
+	default: {
+		unsigned long ofs = (unsigned long) (p->regex - p->base);
+		const char fmt[] = "col:%lu invalid prefix '%c': \"%s\"";
+		sc_strfmt(alloc, &p->errmsg, fmt, ofs, *p->lex, p->lex);
+
+		if (p->errmsg == NULL)
+			goto OUT_OF_MEMORY;
+
+		goto INVARIANT;
+	}
+	}
+
+	return RGX_NEXT;
+
+OUT_OF_MEMORY:
+	p->errmsg = "oom";
+
+INVARIANT:
+	return RGX_INVARIANT;
+ENDED:
+	return RGX_ENDED;
+}
+
+// Combine the expression already in *root with the operator at the current
+// token; *root is replaced by the new operator node. Any token that is not
+// an explicit operator is treated as implicit concatenation.
+static inline int infix(rgxast_t *root,
+			struct sc_mem *alloc,
+			struct parser *p,
+			struct rgxast *asts)
+{
+	if (*root >= asts->size) {
+		assert(0);
+		p->errmsg = "what the frick?";
+		goto INVARIANT;
+	}
+
+	rgxast_t left = *root;
+
+	switch (p->type) {
+	case TKN_EOF:
+		goto ENDED;
+	case TKN_VERTBAR: {
+		rgxast_t right = 0;
+
+		// consume '|'
+		next(&p->regex, &p->type, &p->lexsz, &p->lex);
+
+		if (mkast(RGXAST_OR, 0, asts, root) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		// the right operand is a whole concatenation, even when it
+		// starts with '(' or '[' ("a|(b)c" is a | ((b)c))
+		if (expr(PREC_CONCAT, alloc, p, asts, &right) != RGX_NEXT)
+			goto PANIC;
+
+		if (addchild(*root, left, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		if (addchild(*root, right, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		break;
+	}
+	case TKN_STAR: {
+		// consume '*'
+		next(&p->regex, &p->type, &p->lexsz, &p->lex);
+
+		if (mkast(RGXAST_STAR, 0, asts, root) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		if (addchild(*root, left, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		break;
+	}
+	case TKN_PLUS: {
+		// consume '+'
+		next(&p->regex, &p->type, &p->lexsz, &p->lex);
+
+		if (mkast(RGXAST_PLUS, 0, asts, root) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		if (addchild(*root, left, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		break;
+	}
+	case TKN_QMARK: {
+		// consume '?'
+		next(&p->regex, &p->type, &p->lexsz, &p->lex);
+
+		if (mkast(RGXAST_QMARK, 0, asts, root) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		if (addchild(*root, left, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		break;
+	}
+	default: {
+		rgxast_t right = 0;
+
+		if (mkast(RGXAST_CONCAT, 0, asts, root) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		if (expr(PREC_CONCAT, alloc, p, asts, &right) != RGX_NEXT)
+			goto PANIC;
+
+		if (addchild(*root, left, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		if (addchild(*root, right, asts) == RGX_OOM)
+			goto OUT_OF_MEMORY;
+
+		break;
+	}
+	}
+
+	return RGX_NEXT;
+
+OUT_OF_MEMORY:
+	p->errmsg = "oom";
+PANIC:
+	// the operand parser can stop without a message (e.g. "a|")
+	if (p->errmsg == NULL)
+		p->errmsg = "missing operand";
+INVARIANT:
+	return RGX_INVARIANT;
+
+ENDED:
+	return RGX_ENDED;
+}
+
+// Precedence-climbing parser: parse one operand, then keep folding infix
+// operators into *root while their precedence is at least `rbp`.
+// Returns RGX_NEXT, RGX_ENDED when the input ran out, or RGX_INVARIANT on
+// error.
+static inline int expr(enum PREC rbp,
+		       struct sc_mem *alloc,
+		       struct parser *p,
+		       struct rgxast *asts,
+		       rgxast_t *root)
+{
+	switch (prefix(root, alloc, p, asts)) {
+	case RGX_ENDED:
+		goto ENDED;
+	case RGX_NEXT:
+		break;
+	default:
+		goto PANIC;
+	}
+
+	// consume prefix
+	next(&p->regex, &p->type, &p->lexsz, &p->lex);
+	enum PREC lbp = tknprec(p->type);
+
+	while (rbp <= lbp) {
+		switch (infix(root, alloc, p, asts)) {
+		case RGX_ENDED:
+			goto ENDED;
+		case RGX_NEXT:
+			break;
+		default:
+			goto PANIC;
+		}
+
+		lbp = tknprec(p->type);
+	}
+
+	return RGX_NEXT;
+ENDED:
+	return RGX_ENDED;
+PANIC:
+	return RGX_INVARIANT;
+}
+
+// Release every array owned by `list` and reset it to the empty state.
+// Passed to sc_reap by rgx_parse.
+static void free_asts(struct rgxast *list)
+{
+	for (size_t i = 0; i < list->size; ++i)
+		jry_free(list->child[i]);
+
+	jry_free(list->c);
+	jry_free(list->child);
+	jry_free(list->childsz);
+	jry_free(list->type);
+
+	list->c = NULL;
+	list->child = NULL;
+	list->childsz = NULL;
+	list->type = NULL;
+	list->size = 0;
+}
+
+// Parse `pattern` into `list`. The first node created is RGXAST_ROOT; the
+// parsed expression, if any, becomes its only child. `list` is handed to
+// sc_reap (with free_asts) before anything is allocated.
+// Returns RGX_OK, or RGX_ERROR with *errmsg (when non-NULL) describing why.
+int rgx_parse(struct sc_mem *alloc,
+	      const char *pattern,
+	      struct rgxast *list,
+	      const char **errmsg)
+{
+	struct parser p = {
+		.base = pattern,
+		.regex = pattern,
+	};
+
+	if (sc_reap(alloc, list, (free_t) free_asts))
+		goto OUT_OF_MEMORY;
+
+	rgxast_t root;
+
+	if (mkast(RGXAST_ROOT, 0, list, &root) == RGX_OOM)
+		goto OUT_OF_MEMORY;
+
+	rgxast_t child = root;
+
+	next(&p.regex, &p.type, &p.lexsz, &p.lex);
+
+	switch (expr(PREC_NONE, alloc, &p, list, &child)) {
+	case RGX_INVARIANT:
+		goto INVARIANT;
+	case RGX_ENDED:
+	case RGX_NEXT:
+		break;
+	default:
+		assert(0);
+		p.errmsg = "what the frick";
+		goto INVARIANT;
+	}
+
+	// an empty pattern yields no expression: leave ROOT childless
+	if (child != root && addchild(root, child, list) == RGX_OOM)
+		goto OUT_OF_MEMORY;
+
+	return RGX_OK;
+
+OUT_OF_MEMORY:
+	p.errmsg = "oom";
+
+INVARIANT:
+	if (errmsg)
+		*errmsg = p.errmsg;
+
+	return RGX_ERROR;
+}
diff --git a/lib/jay/regex.h b/lib/jay/regex.h
new file mode 100644
index 0000000..6a29d60
--- /dev/null
+++ b/lib/jay/regex.h
@@ -0,0 +1,43 @@
+#ifndef JAY_REGEX_H
+#define JAY_REGEX_H
+
+#include "jary/memory.h"
+
+#include <stdint.h>
+
+typedef uint16_t rgxast_t;
+
+enum {
+	RGX_OK = 0,
+	RGX_ERROR = 4,
+};
+
+enum RGXAST {
+	RGXAST_ROOT,
+	RGXAST_ESCAPE,
+	RGXAST_SINGLE,
+	RGXAST_CHARSET,
+	RGXAST_OR,
+	RGXAST_STAR,
+	RGXAST_PLUS,
+	RGXAST_QMARK,
+	RGXAST_CONCAT,
+};
+
+struct rgxast {
+	enum RGXAST *type;
+	char *c;
+	rgxast_t **child;
+	rgxast_t *childsz;
+	rgxast_t size;
+};
+
+// Parse `pattern` into the node arrays of `list`, which is handed to sc_reap
+// on `alloc`. Returns RGX_OK, or RGX_ERROR with *errmsg (when non-NULL) set
+// to a description of the failure.
+int rgx_parse(struct sc_mem *alloc,
+	      const char *pattern,
+	      struct rgxast *list,
+	      const char **errmsg);
+
+#endif // JAY_REGEX_H
diff --git a/lib/jay/token.h b/lib/jay/token.h
index 5165c7a..6ef35c2 100644
--- a/lib/jay/token.h
+++ b/lib/jay/token.h
@@ -87,8 +87,8 @@ enum jy_tkn {
 	TKN_TILDE,
 	TKN_CONCAT,
 
-	TKN_PLUS,
 	TKN_MINUS,
+	TKN_PLUS,
 	TKN_STAR,
 	TKN_SLASH,
 	TKN_COMMENT,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e80958d..eb0b404 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -50,6 +50,30 @@ add_executable( compiler_test compiler_test.cc )
 add_executable( exec_test exec_test.cc )
 add_executable( jary_test jary_test.cc )
 
+target_include_directories( scanner_test
+  PRIVATE
+    ${CMAKE_SOURCE_DIR}/lib/
+)
+
+target_include_directories( parser_test
+  PRIVATE
+    ${CMAKE_SOURCE_DIR}/include
+    ${CMAKE_SOURCE_DIR}/lib/
+)
+
+target_include_directories( compiler_test
+  PRIVATE
+    ${CMAKE_SOURCE_DIR}/include
+    ${CMAKE_SOURCE_DIR}/lib/
+)
+
+target_include_directories( exec_test + PRIVATE + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/lib/ +) + + target_compile_definitions( compiler_test PUBLIC BASIC_JARY_PATH="$/compiler_basic.jary" diff --git a/test/compiler_test.cc b/test/compiler_test.cc index daf8dbb..cca5eb5 100644 --- a/test/compiler_test.cc +++ b/test/compiler_test.cc @@ -32,11 +32,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include extern "C" { -#include "ast.h" -#include "compiler.h" -#include "error.h" -#include "parser.h" -#include "token.h" +#include "jay/ast.h" +#include "jay/compiler.h" +#include "jay/error.h" +#include "jay/parser.h" +#include "jay/token.h" #include "jary/memory.h" } diff --git a/test/exec_test.cc b/test/exec_test.cc index e747a9e..4819fd3 100644 --- a/test/exec_test.cc +++ b/test/exec_test.cc @@ -32,12 +32,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include extern "C" { -#include "ast.h" -#include "compiler.h" -#include "error.h" -#include "exec.h" -#include "parser.h" -#include "token.h" +#include "jay/ast.h" +#include "jay/compiler.h" +#include "jay/error.h" +#include "jay/exec.h" +#include "jay/parser.h" +#include "jay/token.h" #include "jary/defs.h" #include "jary/memory.h" diff --git a/test/parser_test.cc b/test/parser_test.cc index 1d9df54..91ed41b 100644 --- a/test/parser_test.cc +++ b/test/parser_test.cc @@ -32,10 +32,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include extern "C" { -#include "ast.h" -#include "error.h" -#include "parser.h" -#include "token.h" +#include "jay/ast.h" +#include "jay/error.h" +#include "jay/parser.h" +#include "jay/token.h" #include "jary/memory.h" } diff --git a/test/scanner_test.cc b/test/scanner_test.cc index c6c7b41..d0da887 100644 --- a/test/scanner_test.cc +++ b/test/scanner_test.cc @@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include extern "C" { -#include "scanner.h" +#include "jay/scanner.h" } TEST(ScannerTest, ScanEOF) diff --git a/tool/CMakeLists.txt b/tool/CMakeLists.txt index aeaf612..9c1dbf0 100644 --- a/tool/CMakeLists.txt +++ b/tool/CMakeLists.txt @@ -28,17 +28,39 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add_executable( jassy ) +add_executable( regdmp ) -target_sources( jassy PRIVATE jassy/jassy.c ) +target_sources( jassy PRIVATE jassy.c ) +target_sources( regdmp + PRIVATE + regdmp.c + + ${CMAKE_SOURCE_DIR}/lib/jay/memory.c +) + +target_include_directories( regdmp + PRIVATE + ${CMAKE_SOURCE_DIR}/lib/ + ${CMAKE_SOURCE_DIR}/include +) + +target_include_directories( jassy + PRIVATE + ${CMAKE_SOURCE_DIR}/lib/ + ${CMAKE_SOURCE_DIR}/include +) target_link_libraries( jassy PRIVATE compiler ) if ( SCRUTINY AND CMAKE_C_COMPILER_ID STREQUAL "GNU" ) target_compile_options( jassy BEFORE PRIVATE -fanalyzer ) + target_compile_options( regdmp BEFORE PRIVATE -fanalyzer ) endif() if ( SANITIZE AND UNIX AND CMAKE_C_COMPILER_ID MATCHES "^(Clang|GNU)$" ) target_link_options( jassy BEFORE PRIVATE -fsanitize=address -fno-omit-frame-pointer ) + target_link_options( regdmp BEFORE PRIVATE -fsanitize=address -fno-omit-frame-pointer ) endif() set_target_properties( jassy PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/ ) +set_target_properties( regdmp PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/ ) diff --git a/tool/jassy/jassy.c b/tool/jassy.c similarity index 99% rename from tool/jassy/jassy.c rename to tool/jassy.c index 498ef76..26bb02e 100644 --- a/tool/jassy/jassy.c +++ b/tool/jassy.c @@ -29,12 +29,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "ast.h" -#include "compiler.h" -#include "dload.h" -#include "error.h" -#include "parser.h" -#include "token.h" +#include "jay/ast.h" +#include "jay/compiler.h" +#include "jay/dload.h" +#include "jay/error.h" +#include "jay/parser.h" +#include "jay/token.h" #include "jary/defs.h" #include "jary/memory.h" @@ -999,7 +999,6 @@ static void run_file(const char *path, const char *dirpath) printf("\n"); if (errs.size) { - /*int sz = print_errors(&errs, &tkns, path);*/ int sz = prerrors(0, NULL, &errs, &tkns, path); char buf[sz]; diff --git a/tool/regdmp.c b/tool/regdmp.c new file mode 100644 index 0000000..f3186c7 --- /dev/null +++ b/tool/regdmp.c @@ -0,0 +1,321 @@ +#include "jay/regex.c" + +#include "jary/memory.h" + +#include + +static inline char *tknstr(enum TKN type) +{ + switch (type) { + case TKN_BACKSLASH: + return "BACKSLASH"; + case TKN_CARET: + return "CARET"; + case TKN_SINGLE: + return "SINGLE"; + case TKN_LEFT_BRACKET: + return "LEFT_BRACKET"; + case TKN_RIGHT_BRACKET: + return "RIGHT_BRACKET"; + case TKN_LEFT_PAREN: + return "LEFT_PAREN"; + case TKN_RIGHT_PAREN: + return "RIGHT_PAREN"; + case TKN_LEFT_BRACE: + return "LEFT_BRACE"; + case TKN_RIGHT_BRACE: + return "RIGHT_BRACE"; + case TKN_VERTBAR: + return "VERTBAR"; + case TKN_QMARK: + return "QMARK"; + case TKN_DOT: + return "DOT"; + case TKN_PLUS: + return "PLUS"; + case TKN_STAR: + return "STAR"; + case TKN_DOLLAR: + return "DOLLAR"; + case TKN_COMMA: + return "COMMA"; + case TKN_ESCAPED: + return "ESCAPED"; + case TKN_EOF: + return "EOF"; + } +} + +static void pr_tknlist(uint32_t size, + uint32_t *offsets, + uint16_t *lexemesz, + const char *const *lexemes, + const enum TKN *const types) +{ + uint32_t maxtypesz = 0; + + for (uint32_t i = 0; i < size; ++i) { + enum TKN t = types[i]; + const char *ts = tknstr(t); + uint32_t len = strlen(ts); + maxtypesz = len > maxtypesz ? 
len : maxtypesz; + } + + printf("Offset Type %*cLexeme\n", maxtypesz - 4, ' '); + + for (uint32_t i = 0; i < size; ++i) { + enum TKN type = types[i]; + const char *lex = lexemes[i]; + int lexsz = lexemesz[i]; + const char *typstr = tknstr(type); + + printf("%5u %s", offsets[i], typstr); + + uint32_t n = strlen(typstr); + + if (n != maxtypesz) + printf("%*c", maxtypesz - n, ' '); + + printf(" "); + printf("%.*s", lexsz, lex); + printf("\n"); + } +} + +static inline size_t findmaxdepth(rgxast_t id, + const struct rgxast *asts, + size_t depth) +{ + rgxast_t *child = asts->child[id]; + rgxast_t childsz = asts->childsz[id]; + size_t maxdepth = depth; + + for (uint32_t i = 0; i < childsz; ++i) { + size_t tmp = findmaxdepth(child[i], asts, depth + 1); + maxdepth = (tmp > maxdepth) ? tmp : maxdepth; + } + + return maxdepth; +} + +static inline void pr_ast(rgxast_t id, + const struct rgxast *asts, + uint32_t midpoint, + uint32_t numsz, + uint32_t depth) +{ + enum RGXAST type = asts->type[id]; + rgxast_t *child = asts->child[id]; + rgxast_t childsz = asts->childsz[id]; + uint32_t printed = 0; + + const char *typestr = "UNKNOWN"; + + switch (type) { + case RGXAST_ESCAPE: + typestr = "ESCAPE"; + break; + case RGXAST_SINGLE: + typestr = "SINGLE"; + break; + case RGXAST_OR: + typestr = "OR"; + break; + case RGXAST_CHARSET: + typestr = "CHARSET"; + break; + case RGXAST_ROOT: + typestr = "ROOT"; + break; + case RGXAST_PLUS: + typestr = "PLUS"; + break; + case RGXAST_QMARK: + typestr = "QMARK"; + break; + case RGXAST_STAR: + typestr = "STAR"; + break; + case RGXAST_CONCAT: + typestr = "CONCAT"; + break; + } + + if (type != RGXAST_ROOT) { + printed += printf("|"); + uint32_t lnsz = depth * 2 + 1; + + char depthline[lnsz]; + memset(depthline, '_', lnsz); + depthline[lnsz - 1] = '\0'; + printed += printf("%s ", depthline); + } + + printed += printf("%s ", typestr); + int diff = midpoint - printed; + + if (diff > 0) { + char dots[diff + 1]; + memset(dots, '.', diff); + dots[diff] = 
'\0'; + printed += printf("%s |", dots); + } + + printf(" ["); + printed = printf("%d", id); + printf("] "); + diff = numsz - printed; + + if (diff > 0) + printf("%*c", diff, ' '); + + printf("| "); + + printf("%c", asts->c[id]); + + printf("\n"); + + for (uint32_t i = 0; i < childsz; ++i) + pr_ast(child[i], asts, midpoint, numsz, depth + 1); +} + +static inline void pr_astlist(const struct rgxast *asts) +{ + size_t depth = findmaxdepth(0, asts, 0); + size_t midpoint = 2 * depth + 20; + int col1sz = midpoint - 4; + int idsz = snprintf(NULL, 0, "%d", asts->size); + + printf("Tree "); + printf("%*c ", col1sz, ' '); + printf(" ID "); + printf(" %*c", idsz + 1, ' '); + printf("Token\n"); + + if (asts->size) + pr_ast(0, asts, midpoint, idsz, 0); +} + +static void prcodes(long size, uint8_t *codes) +{ +#define OFS() pc.u8 - codes + union codeview pc = { .u8 = codes }; + for (; OFS() < size;) { + printf("%5ld | ", OFS()); + enum OPCODE opcode = *pc.u8; + pc.u8 += 1; + + switch (opcode) { + case OP_CHAR: + printf("CHAR %c", pc.u8[0]); + pc.u8 += 1; + break; + case OP_JMP16: + printf("JMP16 %d", pc.i16[0]); + pc.u16 += 1; + break; + case OP_SPLIT8: + printf("SPLIT8 %d %d", pc.i8[0], pc.i8[1]); + pc.i8 += 2; + break; + case OP_SPLIT16: + printf("SPLIT16 %d %d", pc.i16[0], pc.i16[1]); + pc.i16 += 2; + break; + } + + printf("\n"); + } + +#undef OFS +} + +int main(int argc, const char **argv) +{ + struct sc_mem alloc = { .buf = NULL }; + + if (argc != 2) { + fprintf(stderr, "usage: regex "); + goto FINISH; + } + + struct { + struct sb_mem *lexbuf; + struct sb_mem *lexszbuf; + struct sb_mem *typebuf; + struct sb_mem *ofsbuf; + const char **lex; + uint16_t *lexsz; + enum TKN *types; + uint32_t *ofs; + uint32_t size; + } tkns = { + .lexbuf = sc_linbuf(&alloc), + .lexszbuf = sc_linbuf(&alloc), + .typebuf = sc_linbuf(&alloc), + .ofsbuf = sc_linbuf(&alloc), + }; + + const char *pattern = argv[1]; + struct parser C1 = { .regex = pattern }; + + while (next(&C1.regex, &C1.type, 
&C1.lexsz, &C1.lex) == RGX_OK) { + tkns.lex = sb_add(tkns.lexbuf, 0, sizeof(*tkns.lex)); + tkns.lexsz = sb_add(tkns.lexszbuf, 0, sizeof(*tkns.lexsz)); + tkns.types = sb_add(tkns.typebuf, 0, sizeof(*tkns.types)); + tkns.ofs = sb_add(tkns.ofsbuf, 0, sizeof(*tkns.ofs)); + + tkns.lex[tkns.size] = C1.lex; + tkns.lexsz[tkns.size] = C1.lexsz; + tkns.types[tkns.size] = C1.type; + tkns.ofs[tkns.size] = C1.regex - pattern - C1.lexsz; + tkns.size += 1; + } + + printf("====================================" + "\n" + "| |" + "\n" + "| Regex Dump ! |" + "\n" + "| |" + "\n" + "====================================" + "\n\n"); + + printf("Tokens" + "\n" + "====================================" + "\n\n"); + + if (tkns.size) { + pr_tknlist(tkns.size, tkns.ofs, tkns.lexsz, tkns.lex, + tkns.types); + printf("\n"); + } + + printf("Abstract Syntax Tree" + "\n" + "====================================" + "\n\n"); + + struct rgxast asts = { .size = 0 }; + const char *errmsg = NULL; + + rgx_parse(&alloc, pattern, &asts, &errmsg); + pr_astlist(&asts); + + if (errmsg != NULL) + printf("\nerror: %s", errmsg); + + printf("\n\n"); + + printf("VM Bytecode" + "\n" + "====================================" + "\n\n"); + +FINISH: + sc_free(&alloc); + return 0; +}