From 0a57b6e7272ac0edb5905b677cfc4e30b6b8f8e9 Mon Sep 17 00:00:00 2001 From: Gary Gan Date: Thu, 5 Jun 2025 21:29:46 +0800 Subject: [PATCH] =?UTF-8?q?parser=E6=9C=AA=E5=AE=8C=E5=85=A8=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/ast.hpp | 119 ++++++++++++++++++ include/syntax/Parser.hpp | 249 ++++++++++++++++++++++++++++++++++++++ include/syntax/Scanner.h | 10 +- include/syntax/token.h | 11 +- include/types.hpp | 49 ++++++++ src/ast.cpp | 2 + 6 files changed, 435 insertions(+), 5 deletions(-) create mode 100644 include/ast.hpp create mode 100644 include/syntax/Parser.hpp create mode 100644 src/ast.cpp diff --git a/include/ast.hpp b/include/ast.hpp new file mode 100644 index 0000000..e8c47cb --- /dev/null +++ b/include/ast.hpp @@ -0,0 +1,119 @@ +#pragma once +#include +#include + +enum ast_type_t { + AST_EXPR_LITERAL = 1, // 常数值 => 预计将存储在 data 段中 + AST_EXPR_BINARY, // 二元表达式 + AST_EXPR_UNARY, // 一元表达式 + AST_EXPR_IDENT, // 标识符表达式 + + AST_EXPR_MAP_ACCESS, + AST_EXPR_VEC_ACCESS, + AST_EXPR_ARRAY_ACCESS, + AST_EXPR_TUPLE_ACCESS, + AST_EXPR_STRUCT_SELECT, + + AST_EXPR_VEC_NEW, // [1, 2, 3] + AST_EXPR_ARRAY_NEW, // [1, 2, 3] + AST_EXPR_VEC_REPEAT_NEW, // [0;12] + AST_EXPR_ARRAY_REPEAT_NEW, // [0;12] + AST_EXPR_EMPTY_CURLY_NEW, // {} + AST_EXPR_MAP_NEW, // {"a": 1, "b": 2} + AST_EXPR_TUPLE_NEW, // (1, 1.1, true) + AST_EXPR_TUPLE_DESTR, // (var_a, var_b, (var_c, var_d)) + AST_EXPR_STRUCT_NEW, // person {a = 1; b = 2} + AST_EXPR_BLOCK, + + // stmt + AST_STMT_EXPR_FAKE, + AST_STMT_BREAK, + AST_STMT_CONTINUE, + AST_STMT_IMPORT, + AST_STMT_VARDEF, + AST_STMT_VAR_TUPLE_DESTR, + AST_STMT_ASSIGN, + AST_STMT_GLOBAL_ASSIGN, + AST_STMT_RETURN, + AST_STMT_IF, + AST_STMT_FOR_ITERATOR, + AST_STMT_FOR_COND, + AST_STMT_FOR_TRADITION, + AST_CALL, + AST_FNDEF, // fn def (其包含 body) + AST_STMT_ENV_CLOSURE, // closure def +}; + +enum ast_expr_op_t { + // ARITHMETIC 运算 + AST_OP_ADD, // + + AST_OP_SUB, // - + AST_OP_MUL, // * + AST_OP_DIV, // / + AST_OP_REM, // % + + // unary + AST_OP_NOT, // unary bool !right, right must bool + AST_OP_NEG, // unary number -right + AST_OP_BNOT, // unary binary ~right, right must int + + // 位运算 + AST_OP_AND, + AST_OP_OR, + AST_OP_XOR, + AST_OP_LSHIFT, + AST_OP_RSHIFT, + + AST_OP_LT, // < + AST_OP_LE, // <= + AST_OP_GT, // > + AST_OP_GE, // >= + AST_OP_EE, // == + AST_OP_NE, // != + + AST_OP_AND_AND, // && + AST_OP_OR_OR, // || +}; + +// 将ast_expr_op_t类型转换为string +inline std::unordered_map ast_type_map = { + // 算术运算符 + {AST_OP_ADD, "+"}, + {AST_OP_SUB, "-"}, + {AST_OP_MUL, "*"}, + {AST_OP_DIV, "/"}, + {AST_OP_REM, "%"}, + + // 位运算符 + {AST_OP_AND, "&"}, + {AST_OP_OR, "|"}, + {AST_OP_XOR, "^"}, + {AST_OP_BNOT, "~"}, + {AST_OP_LSHIFT, "<<"}, + {AST_OP_RSHIFT, ">>"}, + + // 比较运算符 + {AST_OP_LT, "<"}, + {AST_OP_LE, "<="}, + {AST_OP_GT, ">"}, + {AST_OP_GE, ">="}, + {AST_OP_EE, "=="}, + {AST_OP_NE, "!="}, + + // 逻辑运算符 + {AST_OP_OR_OR, "||"}, + {AST_OP_AND_AND, "&&"}, + {AST_OP_NOT, "!"}, // 一元运算符:逻辑非 + {AST_OP_NEG, "-"}, // 一元运算符:负号 +}; + +struct ast_stmt_t { + int line; + int column; + bool error; + + ast_type_t assert_type; // 声明语句类型 + void *value; +}; + + diff --git a/include/syntax/Parser.hpp b/include/syntax/Parser.hpp new file mode 100644 index 0000000..3336e90 --- /dev/null +++ b/include/syntax/Parser.hpp @@ -0,0 +1,249 @@ +#pragma once + +#include "../ast.hpp" +#include "../types.hpp" +#include "token.h" + +class Parser { +public: + Parser(module_t module) + : m(module) {}; + +private: + module_t m; + + // 定义优先级的枚举类型 + enum precedence { + PRECEDENCE_NULL,// 最低优先级 + PRECEDENCE_ASSIGN, + PRECEDENCE_CATCH, + PRECEDENCE_OR_OR, // || + PRECEDENCE_AND_AND, // && + PRECEDENCE_OR, // | + PRECEDENCE_XOR, // ^ + PRECEDENCE_AND, // % + PRECEDENCE_CMP_EQUAL,// == != + PRECEDENCE_COMPARE, // > < >= <= + PRECEDENCE_SHIFT, // << >> + PRECEDENCE_TERM, // + - + PRECEDENCE_FACTOR, // * / % + PRECEDENCE_UNARY, // - / ! / ~ / * / & + PRECEDENCE_CALL, // foo.bar foo["bar"] foo() foo().foo.bar 这几个表达式都是同一优先级,应该从左往右依次运算 + PRECEDENCE_PRIMARY, // 最高优先级 + }; + + inline std::unordered_map token_to_ast_op = { + // 算术运算符 + {TOKEN_PLUS, AST_OP_ADD}, // + + {TOKEN_MINUS, AST_OP_SUB}, // - + {TOKEN_STAR, AST_OP_MUL}, // * + {TOKEN_SLASH, AST_OP_DIV}, // / + {TOKEN_PERCENT, AST_OP_REM}, // % + + // 比较运算符 + {TOKEN_EQUAL_EQUAL, AST_OP_EE}, // == + {TOKEN_NOT_EQUAL, AST_OP_NE}, // != + {TOKEN_GREATER_EQUAL, AST_OP_GE},// >= + {TOKEN_RIGHT_ANGLE, AST_OP_GT}, // > + {TOKEN_LESS_EQUAL, AST_OP_LE}, // <= + {TOKEN_LESS_THAN, AST_OP_LT}, // < + + // 逻辑运算符 + {TOKEN_AND_AND, AST_OP_AND_AND}, // && + {TOKEN_OR_OR, AST_OP_OR_OR}, // || + + // 位运算符 + {TOKEN_TILDE, AST_OP_BNOT}, // ~ + {TOKEN_AND, AST_OP_AND}, // & + {TOKEN_OR, AST_OP_OR}, // | + {TOKEN_XOR, AST_OP_XOR}, // ^ + {TOKEN_LEFT_SHIFT, AST_OP_LSHIFT}, // << + {TOKEN_RIGHT_SHIFT, AST_OP_RSHIFT}, // >> + + // 复合赋值运算符(拆解为基本运算) + {TOKEN_PERCENT_EQUAL, AST_OP_REM}, // %= + {TOKEN_MINUS_EQUAL, AST_OP_SUB}, // -= + {TOKEN_PLUS_EQUAL, AST_OP_ADD}, // += + {TOKEN_SLASH_EQUAL, AST_OP_DIV}, // /= + {TOKEN_STAR_EQUAL, AST_OP_MUL}, // *= + {TOKEN_OR_EQUAL, AST_OP_OR}, // |= + {TOKEN_AND_EQUAL, AST_OP_AND}, // &= + {TOKEN_XOR_EQUAL, AST_OP_XOR}, // ^= + {TOKEN_LEFT_SHIFT_EQUAL, AST_OP_LSHIFT}, // <<= + {TOKEN_RIGHT_SHIFT_EQUAL, AST_OP_RSHIFT}, // >>= + }; + + inline std::unordered_map token_to_type = { + // 字面量类型 + {TOKEN_TRUE, TYPE_BOOL}, // true + {TOKEN_FALSE, TYPE_BOOL}, // false + {TOKEN_VOID, TYPE_VOID}, // void + {TOKEN_LITERAL_FLOAT, TYPE_FLOAT64},// 浮点字面量 + {TOKEN_LITERAL_INT, TYPE_INT32}, // 整数字面量 + {TOKEN_LITERAL_STRING, TYPE_STRING}, // 字符串字面量 + + // 基本数据类型 + {TOKEN_BOOL, TYPE_BOOL}, // bool + {TOKEN_STRING, TYPE_STRING}, // string + + // 定长整数类型 + {TOKEN_I8, TYPE_INT8}, // int8 + {TOKEN_I16, TYPE_INT16}, // int16 + {TOKEN_I32, TYPE_INT32}, // int32 + {TOKEN_I64, TYPE_INT64}, // int64 + {TOKEN_U8, TYPE_UINT8}, // uint8 + {TOKEN_U16, TYPE_UINT16}, // uint16 + {TOKEN_U32, TYPE_UINT32}, // uint32 + {TOKEN_U64, TYPE_UINT64}, // uint64 + {TOKEN_F32, TYPE_FLOAT32}, // float32 + {TOKEN_F64, TYPE_FLOAT64}, // float64 + + // 特殊类型 + {TOKEN_VAR, TYPE_UNKNOWN}, // var (未推导类型) + + // 复合数据类型 + {TOKEN_VEC, TYPE_VEC}, // vector + {TOKEN_MAP, TYPE_MAP}, // map + }; + + // 返回当前token,token指针向前进一位 + inline token_t advance() { + if (m.parser_cursor.current + 1 >= m.token_list.size()) { + std::cerr << "[ERROR] line: " << m.token_list[m.parser_cursor.current].line << ". column: " + << m.token_list[m.parser_cursor.current].column <<". next token is null"; + } + token_t token = m.token_list[m.parser_cursor.current]; + m.parser_cursor.current++; + return token; + } + + inline token_t retreate() { + if (m.parser_cursor.current - 1 < 0) { + std::cerr << "[ERROR] line: " << m.token_list[m.parser_cursor.current].line << ". column: " + << m.token_list[m.parser_cursor.current].column <<". prev token is null"; + } + token_t token = m.token_list[m.parser_cursor.current]; + m.parser_cursor.current--; + return token; + } + + inline token_t peak() { + return m.token_list[m.parser_cursor.current]; + } + + inline token_t prev() { + if (m.parser_cursor.current - 1 < 0) { + std::cerr << "[ERROR] line: " << m.token_list[m.parser_cursor.current].line << ". column: " + << m.token_list[m.parser_cursor.current].column <<". prev token is null"; + } + token_t token = m.token_list[m.parser_cursor.current - 1]; + return token; + } + + inline bool is(token_type_t expect) { + return m.token_list[m.parser_cursor.current].type == expect; + } + + inline bool ident_is(std::string expect) { + return m.token_list[m.parser_cursor.current].literal == expect; + } + + inline bool is_literal() { + return is(TOKEN_LITERAL_FLOAT) || + is(TOKEN_LITERAL_INT) || + is(TOKEN_LITERAL_STRING); + } + + inline token_t consume(token_type_t expect) { + token_t t = m.token_list[m.parser_cursor.current]; + if (t.type == expect) { + advance(); + return true; + } + return false; + } + + inline token_t must(token_type_t expect) { + token_t t = m.token_list[m.parser_cursor.current]; + if (t.type != expect) { + std::cerr << "[ERROR] line: " << m.token_list[m.parser_cursor.current].line << ". column: " + << m.token_list[m.parser_cursor.current].column <<". expected " << token_str[expect] + << " but got " << t.literal; + } + + advance(); + return t; + } + + inline token_t next(int step) { + if (m.parser_cursor.current + step >= m.token_list.size()) { + return nullptr; + } + return m.token_list[m.parser_cursor.current + step]; + } + + inline bool next_is(int step, token_type_t expect) { + if (m.parser_cursor.current + step >= m.token_list.size()) { + return false; + } + return m.token_list[m.parser_cursor.current + step].type == expect; + } + + inline bool parser_must_stmt_end() { + if (is(TOKEN_EOF) || is(TOKEN_RIGHT_CURLY)) { + return true; + } + + token_t t = prev(); + std::cerr << "[ERROR] line: " << t.line << ". column: " + << t.column<<". excepted ';' or '}' at end of statement"; + return false; + } + + inline bool is_basic_type() { + if (is(TOKEN_VAR) || is(TOKEN_VOID) || + is(TOKEN_I8) || is(TOKEN_I16) || + is(TOKEN_I32) || is(TOKEN_I64) || + is(TOKEN_U8) || is(TOKEN_U16) || is(TOKEN_U32) || + is(TOKEN_U64)|| is(TOKEN_F32) || is(TOKEN_F64) || + is(TOKEN_BOOL) || is(TOKEN_STRING)) { + return true; + } + return false; + } + + /** + * var a = xxx + * int a = xxx + * bool a = xxx + * string a = xxx + * @return + */ + inline bool is_type_begin_stmt() { + // var/any/int/float/bool/string + if (is_basic_type()) { + return true; + } + return true; + } + + inline ast_stmt_t global_stmt() { + if (is_type_begin_stmt()) { + + } + } + + inline std::vector parser() { + // parser_cursor初始化已经在Scanner中初始化了 + std::vector ret; + + while (!is(TOKEN_EOF)) { + ret.push_back(parser_global_stmt()); + parser_must_stmt_end(); + } + + return ret; + }; + + +}; \ No newline at end of file diff --git a/include/syntax/Scanner.h b/include/syntax/Scanner.h index e369c46..29760e5 100644 --- a/include/syntax/Scanner.h +++ b/include/syntax/Scanner.h @@ -27,6 +27,10 @@ public: } tokens.push_back(token_t(TOKEN_EOF, "EOF", module.s_cursor.line, module.s_cursor.column)); + + // 词法分析后实例化语法分析相关变量 + module.parser_cursor.tokens = tokens; + module.parser_cursor.current = 0; return tokens; } @@ -171,7 +175,7 @@ private: case '?': return TOKEN_QUESTION; case '%': - return match('=') ? TOKEN_PERSON_EQUAL : TOKEN_PERSON; + return match('=') ? TOKEN_PERCENT_EQUAL : TOKEN_PERCENT; case '-': if (match('=')) { return TOKEN_MINUS_EQUAL; @@ -528,6 +532,10 @@ private: if (word.substr(2, 1) == "c" && word.size() == 3) { return TOKEN_VEC; } + case 'o': // void + if (word.substr(2,2) == "id" && word.size() == 4) { + return TOKEN_VOID; + } } } case 'u': { diff --git a/include/syntax/token.h b/include/syntax/token.h index c0b1561..01a11cf 100644 --- a/include/syntax/token.h +++ b/include/syntax/token.h @@ -25,7 +25,7 @@ enum token_type_t { TOKEN_SEMICOLON, // ; TOKEN_SLASH, // / TOKEN_STAR, // a * b, *a - TOKEN_PERSON, // % + TOKEN_PERCENT, // % TOKEN_QUESTION, // ? TOKEN_RIGHT_ARROW,// -> @@ -42,7 +42,7 @@ enum token_type_t { TOKEN_MINUS_EQUAL, // -= TOKEN_STAR_EQUAL, // *= TOKEN_SLASH_EQUAL, // /= - TOKEN_PERSON_EQUAL, // %= + TOKEN_PERCENT_EQUAL, // %= TOKEN_AND_EQUAL, // &= TOKEN_OR_EQUAL, // |= TOKEN_XOR_EQUAL, // ^= @@ -88,6 +88,8 @@ enum token_type_t { TOKEN_TRUE, TOKEN_FALSE, TOKEN_TYPE, + // 要加上void,函数没有返回时要用到 + TOKEN_VOID, TOKEN_STRUCT, TOKEN_CONTINUE, TOKEN_BREAK, @@ -120,7 +122,7 @@ inline static std::unordered_map token_str = { {TOKEN_SEMICOLON, ";"}, {TOKEN_SLASH, "/"}, {TOKEN_STAR, "*"}, - {TOKEN_PERSON, "%"}, + {TOKEN_PERCENT, "%"}, {TOKEN_QUESTION, "?"}, {TOKEN_RIGHT_ARROW, "->"}, {TOKEN_NOT, "!"}, @@ -135,7 +137,7 @@ inline static std::unordered_map token_str = { {TOKEN_MINUS_EQUAL, "-="}, {TOKEN_STAR_EQUAL, "*="}, {TOKEN_SLASH_EQUAL, "/="}, - {TOKEN_PERSON_EQUAL, "%="}, + {TOKEN_PERCENT_EQUAL, "%="}, {TOKEN_AND_EQUAL, "&="}, {TOKEN_OR_EQUAL, "|="}, {TOKEN_XOR_EQUAL, "^="}, @@ -176,6 +178,7 @@ inline static std::unordered_map token_str = { {TOKEN_BREAK, "break"}, {TOKEN_FOR, "for"}, {TOKEN_IN, "in"}, + {TOKEN_VOID, "void"}, {TOKEN_IF, "if"}, {TOKEN_ELSE, "else"}, {TOKEN_ELSE_IF, "else if"}, diff --git a/include/types.hpp b/include/types.hpp index e85d6fa..5bcbdc3 100644 --- a/include/types.hpp +++ b/include/types.hpp @@ -3,6 +3,46 @@ #include #include "syntax/token.h" +enum type_kind { + TYPE_BOOL = 1, + TYPE_INT8, + TYPE_UINT8, // uint8 ~ int 的顺序不可变,用于隐式类型转换 + TYPE_INT16, + TYPE_UINT16, + TYPE_INT32, + TYPE_UINT32, // value=10 + TYPE_INT64, + TYPE_UINT64, + + TYPE_FLOAT32, + TYPE_FLOAT64, // value = 5 + + // 复合类型 + TYPE_STRING, + TYPE_VEC, + TYPE_MAP, // value = 20 + TYPE_TUPLE, + TYPE_STRUCT, + TYPE_FN, // 具体的 fn 类型 + + + TYPE_FN_T, // 底层类型 + TYPE_INTEGER_T, // 底层类型 + TYPE_FLOATER_T, // 底层类型 + TYPE_ALL_T, // 通配所有类型 + + TYPE_VOID, // 表示函数无返回值 + TYPE_UNKNOWN, // var a = 1, a 的类型就是 unknown + TYPE_RAW_STRING, // c 语言中的 string, 目前主要用于 lir 中的 string imm + + + // TYPE_ALIAS, // 声明一个新的类型时注册的 type 的类型是这个 + // TYPE_PARAM, // type formal param type foo = f1|f2, 其中 f1 就是一个 param + TYPE_IDENT, + +}; + +// 语法分析扫描器指证 struct scanner_cursor_t { std::string source; std::string::size_type current; @@ -16,12 +56,21 @@ struct scanner_cursor_t { char space_next; }; +struct parser_cursor_t { + std::vector tokens; + std::string::size_type current; +}; + struct module_t { std::string source; + // 在语法分析中需要用到的变量 scanner_cursor_t s_cursor; std::vector token_list; + // 在语义分析中需要用到的变量 + parser_cursor_t parser_cursor; + module_t(std::string source) : source(source) { s_cursor.source = source; diff --git a/src/ast.cpp b/src/ast.cpp new file mode 100644 index 0000000..0dc5800 --- /dev/null +++ b/src/ast.cpp @@ -0,0 +1,2 @@ +#include "ast.hpp" +