From 0a3bfc4a0810c9255e382d11e9d7840f7853a700 Mon Sep 17 00:00:00 2001 From: Gary Gan Date: Wed, 4 Jun 2025 20:48:10 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A4=A7=E8=87=B4=E5=AE=9E=E7=8E=B0=E4=BA=86?= =?UTF-8?q?=E8=AF=8D=E6=B3=95=E5=88=86=E6=9E=90=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/syntax/Scanner.h | 321 ++++++++++++++++++++++++++++----------- include/types.hpp | 6 + unit/scanner_test.cpp | 29 +--- 3 files changed, 243 insertions(+), 113 deletions(-) diff --git a/include/syntax/Scanner.h b/include/syntax/Scanner.h index 8b9e9f8..4e80214 100644 --- a/include/syntax/Scanner.h +++ b/include/syntax/Scanner.h @@ -12,6 +12,24 @@ public: Scanner(module_t& module) : module(module) {} + inline std::vector scan() { + std::vector tokens; + + while (!at_eof()) { + + if (skip_space()) { + // 如果是空格或换行,则跳过 + continue; + } + + token_t token = item(); + tokens.push_back(token); + } + + tokens.push_back(token_t(TOKEN_EOF, "EOF", module.s_cursor.line, module.s_cursor.column)); + return tokens; + } + private: module_t& module; @@ -90,7 +108,7 @@ private: } inline bool at_eof() { - return module.s_cursor.guard == '\0'; + return module.s_cursor.source[module.s_cursor.guard] == '\0'; } inline char guard_advance() { @@ -127,7 +145,7 @@ private: return gen_word(); } - inline token_type_t scanner_special_char(module_t *m) { + inline token_type_t special_char() { char c = guard_advance(); switch (c) { case '(': @@ -210,6 +228,7 @@ private: default: return token_type_t::TOKEN_NOT_IN_THIS_TYPE; } + } inline std::string string_advance() { @@ -293,6 +312,21 @@ private: } } + inline double number_convert_float(std::string word) { + try { + double decimal = std::stod(word); + return decimal; + } catch (const std::invalid_argument& e) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". Invalid number: " << word << std::endl; + return 0; + } catch (const std::out_of_range& e) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". Number out of range: " << word << std::endl; + return 0; + } + } + inline std::string hex_number_advance() { module.s_cursor.guard += 2; // 跳过 0x @@ -300,7 +334,7 @@ private: guard_advance(); } - return gen_word(); + return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length); } inline std::string oct_number_advance() { @@ -310,7 +344,7 @@ private: guard_advance(); } - return gen_word(); + return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length); } inline std::string bin_number_advance() { @@ -320,7 +354,7 @@ private: guard_advance(); } - return gen_word(); + return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length); } inline std::string number_advance() { @@ -358,118 +392,103 @@ private: case 'a': { switch (word[1]) { case 'r': { - if (word.substr(2, 3) == "ray") { + if (word.substr(2, 3) == "ray" && word.size() == 5) { return TOKEN_ARR; } } } break; } - case 'b': + case 'b': { switch (word[1]) { case 'o': - if (word.substr(2, 2) == "ol") { + if (word.substr(2, 2) == "ol" && word.size() == 4) { return TOKEN_BOOL; + } case 'r': - if (word.substr(2, 3) == "eak") { + if (word.substr(2, 3) == "eak" && word.size() == 5) { return TOKEN_BREAK; } } break; - case 'c': + } + case 'c': { switch (word[1]) { case 'o': - if (word.substr(2, 6) == "ntinue") { + if (word.substr(2, 6) == "ntinue" && word.size() == 8) { return TOKEN_CONTINUE; } } break; - case 'e': - if (word.substr(1, 3) == "lse") { - if (length == 3 && word[3] == 'i') { - return TOKEN_ELSE_IF; - } + } + case 'e': { + if (word.substr(1, 3) == "lse" && word.size() == 4) { return TOKEN_ELSE; } - return scanner_rest(word, length, 1, 3, "lse", TOKEN_ELSE); + } case 'f': { switch (word[1]) { case 'n': - return scanner_rest(word, length, 2, 0, "", TOKEN_FN); + if (word.size() == 2) { + return TOKEN_FN; + } case 'a': - return scanner_rest(word, length, 2, 3, "lse", TOKEN_FALSE); - case 'l': - return scanner_rest(word, length, 2, 3, "oat", TOKEN_FLOAT); + if (word.substr(2, 3) == "lse" && word.size() == 5) { + return TOKEN_FALSE; + } case '3': - return scanner_rest(word, length, 2, 1, "2", TOKEN_F32); + if (word.substr(2, 1) == "2" && word.size() == 3) { + return TOKEN_F32; + } case '6': - return scanner_rest(word, length, 2, 1, "4", TOKEN_F64); + if (word.substr(2, 1) == "4" && word.size() == 3) { + return TOKEN_F64; + } case 'o': - return scanner_rest(word, length, 2, 1, "r", TOKEN_FOR); + if (word.substr(2, 1) == "r" && word.size() == 3) { + return TOKEN_FOR; + } } break; } - case 'g': - return scanner_rest(word, length, 1, 1, "o", TOKEN_GO); case 'i': { - if (length == 2 && word[1] == 'n') { - return TOKEN_IN; - } else if (length == 2 && word[1] == 's') { - return TOKEN_IS; - } else if (length == 3 && word[1] == 'n' && word[2] == 't') { - return TOKEN_INT; - } - switch (word[1]) { - case 'm': - return scanner_rest(word, length, 2, 4, "port", TOKEN_IMPORT); case 'f': - return scanner_rest(word, length, 2, 0, "", TOKEN_IF); - case 'n': - return scanner_rest(word, length, 2, 7, "terface", TOKEN_INTERFACE); + if (word.size() == 2) { + return TOKEN_IF; + } case '8': - return scanner_rest(word, length, 2, 0, "", TOKEN_I8); + if (word.size() == 2) { + return TOKEN_I8; + } case '1': - return scanner_rest(word, length, 2, 1, "6", TOKEN_I16); + if (word.substr(2, 1) == "6" && word.size() == 3) { + return TOKEN_I16; + } case '3': - return scanner_rest(word, length, 2, 1, "2", TOKEN_I32); + if (word.substr(2, 1) == "2" && word.size() == 3) { + return TOKEN_I32; + } case '6': - return scanner_rest(word, length, 2, 1, "4", TOKEN_I64); + if (word.substr(2, 1) == "4" && word.size() == 3) { + return TOKEN_I64; + } } break; } - case 'l': { - return scanner_rest(word, length, 1, 2, "et", TOKEN_LET); - } - case 'n': - switch (word[1]) { - case 'u': // null - return scanner_rest(word, length, 2, 2, "ll", TOKEN_NULL); - // case 'e':// new, new 识别成 ident 在 parser 采用固定语法结构时才会被识别成 new - // return scanner_rest(word, length, 2, 1, "w", TOKEN_NEW); - } - break; - case 'p': - return scanner_rest(word, length, 1, 2, "tr", TOKEN_PTR); case 's': { // self,string,struct,sizeof,sett - switch (word[1]) { - case 'e': { - switch (word[2]) { - case 't': - return scanner_rest(word, length, 3, 0, "", TOKEN_SET); - case 'l': // select - return scanner_rest(word, length, 3, 3, "ect", TOKEN_SELECT); - } - } - } if (length == 6 && word[1] == 't' && word[2] == 'r') { switch (word[3]) { case 'i': - return scanner_rest(word, length, 4, 2, "ng", TOKEN_STRING); + if (word.substr(4, 2) == "ng" && word.size() == 6) { + return TOKEN_STRING; + } case 'u': - return scanner_rest(word, length, 4, 2, "ct", TOKEN_STRUCT); + if (word.substr(4, 2) == "ct" && word.size() == 6) { + return TOKEN_STRUCT; + } } } break; @@ -477,18 +496,20 @@ private: case 't': { // tup/throw/type/true switch (word[1]) { - case 'h': - return scanner_rest(word, length, 2, 3, "row", TOKEN_THROW); case 'y': // type - return scanner_rest(word, length, 2, 2, "pe", TOKEN_TYPE); + if (word.substr(2, 2) == "pe" && word.size() == 4) { + return TOKEN_TYPE; + } case 'u': // tup - return scanner_rest(word, length, 2, 1, "p", TOKEN_TUP); + if (word.substr(2, 1) == "p" && word.size() == 3) { + return TOKEN_TUP; + } case 'r': { switch (word[2]) { - case 'y': - return scanner_rest(word, length, 3, 0, "", TOKEN_TRY); case 'u': - return scanner_rest(word, length, 3, 1, "e", TOKEN_TRUE); + if (word.substr(3, 1) == "e" && word.size() == 4) { + return TOKEN_TRUE; + } } break; } @@ -498,25 +519,33 @@ private: case 'v': { switch (word[1]) { case 'a': - return scanner_rest(word, length, 2, 1, "r", TOKEN_VAR); + if (word.substr(2, 1) == "r" && word.size() == 3) { + return TOKEN_VAR; + } case 'e': // vec - return scanner_rest(word, length, 2, 1, "c", TOKEN_VEC); - case 'o': // void - return scanner_rest(word, length, 2, 2, "id", TOKEN_VOID); + if (word.substr(2, 1) == "c" && word.size() == 3) { + return TOKEN_VEC; + } } } case 'u': { switch (word[1]) { - case 'i': - return scanner_rest(word, length, 2, 2, "nt", TOKEN_UINT); case '8': - return scanner_rest(word, length, 2, 0, "", TOKEN_U8); + if (word.size() == 2) { + return TOKEN_U8; + } case '1': - return scanner_rest(word, length, 2, 1, "6", TOKEN_U16); + if (word.substr(2, 1) == "6" && word.size() == 3) { + return TOKEN_U16; + } case '3': - return scanner_rest(word, length, 2, 1, "2", TOKEN_U32); + if (word.substr(2, 1) == "2" && word.size() == 3) { + return TOKEN_U32; + } case '6': - return scanner_rest(word, length, 2, 1, "4", TOKEN_U64); + if (word.substr(2, 1) == "4" && word.size() == 3) { + return TOKEN_U64; + } } break; } @@ -526,28 +555,136 @@ private: case 'a': { switch (word[2]) { case 'p': - return scanner_rest(word, length, 3, 0, "", TOKEN_MAP); - case 't': - return scanner_rest(word, length, 3, 2, "ch", TOKEN_MATCH); + if (word.size() == 3) { + return TOKEN_MAP; + } } } } } case 'r': { - return scanner_rest(word, length, 1, 5, "eturn", TOKEN_RETURN); + if (word.substr(1, 5) == "eturn" && word.size() == 6) { + // return + return TOKEN_RETURN; + } } } return TOKEN_IDENT; } + inline bool multi_comment_end() { + return module.s_cursor.source[module.s_cursor.guard] == '*' && + module.s_cursor.source[module.s_cursor.guard + 1] == '/'; + } + + inline bool skip_space() { + bool has_new = false; + + if (module.s_cursor.guard != module.s_cursor.current) { + module.s_cursor.space_prev = module.s_cursor.source[module.s_cursor.guard - 1]; + } + + while (true) { + char c = module.s_cursor.source[module.s_cursor.guard]; + switch (c) { + case ' ': + case '\r': + case '\t': { + guard_advance(); + break; + } + case '\n': { + guard_advance(); + has_new = true; + break; + } + case '/': { + if (module.s_cursor.source[module.s_cursor.guard + 1] == '/') { + // 单行注释 + while (module.s_cursor.source[module.s_cursor.guard] != '\n' && !at_eof()) { + guard_advance(); + } + break; + } else if (module.s_cursor.source[module.s_cursor.guard + 1] == '*') { + while (!multi_comment_end()) { + if (at_eof()) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". multi comment not end."; + return false; + } + guard_advance(); + } + + guard_advance(); // 跳过 * + guard_advance(); // 跳过 / + break; + } else { + module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard]; + return has_new; + } + break; + } + default: { + module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard]; + return has_new; + } + } + } + + } + inline token_t item() { module.s_cursor.length = 0; // 重置长度 - module.s_cursor.guard = module.s_cursor.current; // 重置游标位置 + module.s_cursor.current = module.s_cursor.guard; // 重置游标位置 if (is_alpha(module.s_cursor.source[module.s_cursor.guard])) { std::string word = ident_advance(); - return token_t(ident) + return token_t(scanner_ident(word, word.size()),word, module.s_cursor.line, module.s_cursor.column); } + + if (is_number(module.s_cursor.source[module.s_cursor.guard])) { + std::string word; + long decimal = 0; + + if (module.s_cursor.source[module.s_cursor.guard] == '0') { + // 可能是十六进制、八进制或二进制 + if (module.s_cursor.source[module.s_cursor.guard + 1] == 'x' || + module.s_cursor.source[module.s_cursor.guard + 1] == 'X') { + word = hex_number_advance(); + decimal = number_convert(word, 16); + } else if (module.s_cursor.source[module.s_cursor.guard + 1] == 'o' || + module.s_cursor.source[module.s_cursor.guard + 1] == 'O') { + word = oct_number_advance(); + decimal = number_convert(word, 8); + } else if (module.s_cursor.source[module.s_cursor.guard + 1] == 'b' || + module.s_cursor.source[module.s_cursor.guard + 1] == 'B') { + word = bin_number_advance(); + decimal = number_convert(word, 2); + } + } else { + word = number_advance(); + decimal = number_convert(word, 10); + } + + token_type_t type; + if (is_float(word)) { + type = TOKEN_LITERAL_FLOAT; + } else { + type = TOKEN_LITERAL_INT; + } + return token_t(type, word, module.s_cursor.line, module.s_cursor.column); + } + if (is_string(module.s_cursor.source[module.s_cursor.guard])) { + std::string word = string_advance(); + return token_t(TOKEN_LITERAL_STRING, word, module.s_cursor.line, module.s_cursor.column); + } + + token_type_t type = special_char(); + + return token_t(type, gen_word(), module.s_cursor.line, module.s_cursor.column); } -}; \ No newline at end of file + + +}; + diff --git a/include/types.hpp b/include/types.hpp index 185ac10..e85d6fa 100644 --- a/include/types.hpp +++ b/include/types.hpp @@ -11,6 +11,9 @@ struct scanner_cursor_t { int line; // 扫描器当前所在的行 int column; // 扫描器当前所在的列 + + char space_prev; // 记录空行,注释前的上一个字符 + char space_next; }; struct module_t { @@ -27,5 +30,8 @@ struct module_t { s_cursor.length = 0; s_cursor.current = 0; s_cursor.guard = 0; + + s_cursor.space_prev = '\0'; + s_cursor.space_next = '\0'; } }; \ No newline at end of file diff --git a/unit/scanner_test.cpp b/unit/scanner_test.cpp index 3715b0c..4b9e70e 100644 --- a/unit/scanner_test.cpp +++ b/unit/scanner_test.cpp @@ -1,30 +1,17 @@ -#include "Token.h" + #include "doctest.h" #include "stdc++.h" -#include "Scanner.h" -#include "Tbs.h" +#include "types.hpp" +#include "syntax/Scanner.h" +#include "syntax/token.h" #include using std::string,std::vector; -TEST_CASE("Scanner test identifier table") { - Tbs tables; - std::string src = "a += b b<<=casd;"; - Scanner scan(src, tables); - scan.scan(); +TEST_CASE("Scanner test") { + module_t module("{}"); + Scanner scanner(module); - for (auto value_src: scan.get_token_list()) { - std::cout << value_src.id << " " << value_src.type << "\n"; - } + scanner.scan(); } -// TEST_CASE("Scanner test Punct table") { -// Tbs tables = {}; -// std::string src = "+=---<<=>>>===--((([]--<<<>."; -// Scanner scan(src, tables); -// scan.scan(); -// std::cout<<"test\n"; -// for (auto e : scan.get_token_list()) { -// std::cout<