diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b92779..e593a59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,9 +11,7 @@ include_directories(${CMAKE_SOURCE_DIR}/include) file(GLOB SOURCES ${CMAKE_SOURCE_DIR}/src/*.cpp) -add_executable(Hydrogen ${SOURCES} - src/scanner.cpp - src/Tbs.cpp) +add_executable(Hydrogen ${SOURCES}) file(GLOB TEST_SOURCES ${CMAKE_SOURCE_DIR}/unit/*.cpp) diff --git a/include/Scanner.h b/include/Scanner.h deleted file mode 100644 index 3e88f48..0000000 --- a/include/Scanner.h +++ /dev/null @@ -1,58 +0,0 @@ -# pragma once -#include "stdc++.h" -#include "Token.h" -#include "Tbs.h" -#include -#include -#include - -class Scanner { -public: - Scanner(std::string source_code, Tbs tables) - : m_source_code(source_code), m_tables(tables) {} - - - void scan() { - int len = 0; - for (int i = 0; i < m_source_code.size(); i++) { - if (len = process_const_table(i)) { - i += len - 1; - len = 0; - } else if (len = process_identifier_table(i)) { - i += len - 1; - len = 0; - } else if (len = process_key_table(i)) { - i += len - 1; - len = 0; - } else if (len = process_punct_table(i)) { - i += len - 1; - len = 0; - } else if (m_source_code[i] == ' ' || m_source_code[i] == '\t' || m_source_code[i] == '\n') { - continue; - } else { - std::cerr << "Error: Tokenize" << std::endl; - exit(0); - } - - } - } - - inline std::vector get_token_list() { - return m_token_list; - } - - int process_const_table(int i); - int process_identifier_table(int i); - int process_key_table(int i); - int process_punct_table(int i); - - -private: - std::string m_source_code; - std::vector m_token_list; - Tbs m_tables; - int index; - - // 记录标识符表的索引 - int identifier_index = 0; -}; \ No newline at end of file diff --git a/include/Tbs.h b/include/Tbs.h deleted file mode 100644 index ffaeb3a..0000000 --- a/include/Tbs.h +++ /dev/null @@ -1,91 +0,0 @@ -# pragma once -#include "stdc++.h" -#include - -using std::unordered_map,std::string; -class Tbs { -public: - unordered_map 
ConstTable; - unordered_map IdTable; - std::unordered_map KeyTable = { - {1, "var"}, - {2, "i8"}, - {3, "i16"}, - {4, "i32"}, - {5, "i64"}, - {6, "u8"}, - {7, "u16"}, - {8, "u32"}, - {9, "u64"}, - {10, "float32"}, - {11, "float64"}, - {12, "char"}, - {13, "for"}, - {14, "if"}, - {15, "else"}, - {16, "bool"}, - {17, "string"}, - {18, "vector"}, - {19, "array"}, - {20, "struct"}, - {21, "tuple"}, - {22, "print"}, - {23, "println"} - }; - - std::unordered_map PunctTable = { - {1, "-"}, - {2, "!"}, - {3, "~"}, - {4, "/"}, - {5, "*"}, - {6, "%"}, - {7, "+"}, - {8, "-"}, - {9, "<<"}, - {10, ">>"}, - {11, ">"}, - {12, ">="}, - {13, "<"}, - {14, "<="}, - {15, "=="}, - {16, "!="}, - {17, "&"}, - {18, "^"}, - {19, "|"}, - {20, "&&"}, - {21, "||"}, - {22, "="}, - {23, "%="}, - {24, "*="}, - {25, "/="}, - {26, "+="}, - {27, "-="}, - {28, "|="}, - {29, "&="}, - {30, "^="}, - {31, "<<="}, - {32, ">>="}, - {33, "("}, - {34, ")"}, - {35, "<"}, - {36, ">"}, - {37, ","}, - {38, "."}, - {39, "["}, - {40, "]"}, - {41, "?"}, - {42, ":"}, - {43, "->"}, - {44,";"} - }; - -}; - -enum Table_Type { - CONST_TABLE, - ID_TABLE, - KEY_TABLE, - PUNCT_TABLE -}; - diff --git a/include/Token.h b/include/Token.h deleted file mode 100644 index f5f55ff..0000000 --- a/include/Token.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once -#include "stdc++.h" -#include "Tbs.h" - -struct Token{ - int id; - Table_Type type; -}; \ No newline at end of file
diff --git a/include/syntax/Scanner.h b/include/syntax/Scanner.h
new file mode 100644
index 0000000..8b9e9f8
--- /dev/null
+++ b/include/syntax/Scanner.h
@@ -0,0 +1,428 @@
+#pragma once
+#include "doctest.h"
+#include "token.h"
+#include "../types.hpp"
+#include <cctype>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+/// Hand-written lexer that walks module.s_cursor over the module's source text.
+///
+/// Cursor fields as the helpers below use them: current is the index where the
+/// lexeme in progress starts, guard is the index of the next unread character,
+/// and length is the number of characters guard_advance() has consumed since
+/// item() last reset it. line and column are what the diagnostics print.
+class Scanner {
+public:
+    Scanner(module_t& module)
+        : module(module) {}
+
+private:
+    module_t& module;
+
+    /// Shorthand for the cursor that every helper reads and advances.
+    scanner_cursor_t& cursor() {
+        return module.s_cursor;
+    }
+
+    /// Character offset positions past guard, or '\0' when that is past the end.
+    char peek(std::string::size_type offset = 0) {
+        const std::string::size_type pos = cursor().guard + offset;
+        return pos < cursor().source.size() ? cursor().source[pos] : '\0';
+    }
+
+    /// Starts a diagnostic on std::cerr with the current line and column; the
+    /// caller streams the message text after it.
+    std::ostream& error_prefix() {
+        return std::cerr << "[ERROR] line: " << cursor().line << ". column: "
+                         << cursor().column << ". ";
+    }
+
+    /// Consumes characters for as long as accepts(next character) holds.
+    template <typename Pred>
+    void advance_while(Pred accepts) {
+        while (!at_eof() && accepts(peek())) {
+            guard_advance();
+        }
+    }
+
+    /// Text of the lexeme in progress: length characters starting at current.
+    std::string gen_word() {
+        return cursor().source.substr(cursor().current, cursor().length);
+    }
+
+    bool is_space(char c) {
+        return c == '\n' || c == '\t' || c == '\r' || c == ' ';
+    }
+
+    /// True for the double quote that delimits a string literal.
+    bool is_string(char s) {
+        return s == '"';
+    }
+
+    /// Decides whether a numeric lexeme is a floating-point literal.
+    ///
+    /// @param word lexeme text to classify.
+    /// @return true when word has an exponent marker or exactly one '.'. A
+    ///         trailing '.' or more than one '.' is reported on std::cerr and
+    ///         yields false.
+    bool is_float(const std::string& word) {
+        int dot_count = 0;
+        bool has_e = false;
+
+        for (const char c : word) {
+            if (c == '.') {
+                dot_count++;
+            } else if (c == 'e' || c == 'E') {
+                has_e = true;
+            }
+        }
+
+        // A literal must not end with '.'. back() reads the real last character;
+        // indexing a std::string with -1 is out of bounds.
+        if (!word.empty() && word.back() == '.') {
+            error_prefix() << "floating-point numbers cannot end with '.'";
+            return false;
+        }
+
+        // An exponent marker is enough to call it a float.
+        if (has_e) {
+            return true;
+        }
+
+        if (dot_count == 0) {
+            return false;
+        }
+
+        if (dot_count > 1) {
+            error_prefix() << "floating-point numbers have multiple '.'";
+            return false;
+        }
+
+        return true;
+    }
+
+    // The <cctype> classifiers are undefined for negative char values, so the
+    // two wrappers below convert through unsigned char first.
+    bool is_alpha(char c) {
+        return std::isalpha(static_cast<unsigned char>(c)) != 0;
+    }
+
+    bool is_number(char c) {
+        return std::isdigit(static_cast<unsigned char>(c)) != 0;
+    }
+
+    bool is_hex_number(char c) {
+        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    }
+
+    bool is_oct_number(char c) {
+        return c >= '0' && c <= '7';
+    }
+
+    bool is_bin_number(char c) {
+        return c == '0' || c == '1';
+    }
+
+    /// True once guard has moved past the last character of the source.
+    bool at_eof() {
+        // guard is an index, so end of input is a bounds check; comparing the
+        // index itself with '\0' would only ever be true at position 0.
+        return cursor().guard >= cursor().source.size();
+    }
+
+    /// Consumes one character and returns it.
+    ///
+    /// Bumps guard, length and column; a consumed '\n' starts a new line
+    /// (line + 1, column 0). At end of input nothing is consumed and '\0' is
+    /// returned.
+    char guard_advance() {
+        if (at_eof()) {
+            return '\0';
+        }
+
+        const char consumed = cursor().source[cursor().guard];
+        cursor().guard++;
+        cursor().length++;
+        cursor().column++;
+
+        if (consumed == '\n') {
+            cursor().line++;
+            cursor().column = 0;
+        }
+
+        return consumed;
+    }
+
+    /// Consumes the next character only when it equals expected.
+    bool match(char expected) {
+        if (at_eof() || peek() != expected) {
+            return false;
+        }
+
+        guard_advance();
+        return true;
+    }
+
+    /// Consumes a run of letters and digits and returns the lexeme so far.
+    std::string ident_advance() {
+        advance_while([this](char c) { return is_alpha(c) || is_number(c); });
+        return gen_word();
+    }
+
+    /// Consumes one operator or punctuation token, preferring the longest
+    /// spelling that matches ("<<=" before "<<" before "<").
+    ///
+    /// @param m not read by the body; kept for signature compatibility.
+    /// @return the token type, or TOKEN_NOT_IN_THIS_TYPE when the consumed
+    ///         character does not start any operator.
+    token_type_t scanner_special_char(module_t *m = nullptr) {
+        (void) m;
+        const char c = guard_advance();
+        switch (c) {
+            case '(': return TOKEN_LEFT_PAREN;
+            case ')': return TOKEN_RIGHT_PAREN;
+            case '[': return TOKEN_LEFT_SQUARE;
+            case ']': return TOKEN_RIGHT_SQUARE;
+            case '{': return TOKEN_LEFT_CURLY;
+            case '}': return TOKEN_RIGHT_CURLY;
+            case ':': return TOKEN_COLON;
+            case ';': return TOKEN_STMT_EOF;
+            case ',': return TOKEN_COMMA;
+            case '?': return TOKEN_QUESTION;
+            case '.': return TOKEN_DOT;
+            case '~': return TOKEN_TILDE;
+            case '%': return match('=') ? TOKEN_PERSON_EQUAL : TOKEN_PERSON;
+            case '+': return match('=') ? TOKEN_PLUS_EQUAL : TOKEN_PLUS;
+            case '/': return match('=') ? TOKEN_SLASH_EQUAL : TOKEN_SLASH;
+            case '*': return match('=') ? TOKEN_STAR_EQUAL : TOKEN_STAR;
+            case '!': return match('=') ? TOKEN_NOT_EQUAL : TOKEN_NOT;
+            case '=': return match('=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL;
+            case '^': return match('=') ? TOKEN_XOR_EQUAL : TOKEN_XOR;
+            case '-':
+                if (match('=')) {
+                    return TOKEN_MINUS_EQUAL;
+                }
+                return match('>') ? TOKEN_RIGHT_ARROW : TOKEN_MINUS;
+            case '<':
+                if (match('<')) {
+                    return match('=') ? TOKEN_LEFT_SHIFT_EQUAL : TOKEN_LEFT_SHIFT;
+                }
+                return match('=') ? TOKEN_LESS_EQUAL : TOKEN_LEFT_ANGLE;
+            case '>':
+                // ">>" without '=' is a shift: once the second '>' has been
+                // consumed it must not be reported as a single '>'.
+                // NOTE(review): if nested generics such as vec<vec<i32>> are
+                // planned, the parser will have to split TOKEN_RIGHT_SHIFT.
+                if (match('>')) {
+                    return match('=') ? TOKEN_RIGHT_SHIFT_EQUAL : TOKEN_RIGHT_SHIFT;
+                }
+                return match('=') ? TOKEN_GREATER_EQUAL : TOKEN_RIGHT_ANGLE;
+            case '&':
+                if (match('&')) {
+                    return TOKEN_AND_AND;
+                }
+                return match('=') ? TOKEN_AND_EQUAL : TOKEN_AND;
+            case '|':
+                if (match('|')) {
+                    return TOKEN_OR_OR;
+                }
+                return match('=') ? TOKEN_OR_EQUAL : TOKEN_OR;
+            default:
+                return token_type_t::TOKEN_NOT_IN_THIS_TYPE;
+        }
+    }
+
+    /// Maps the character after a backslash to the character it stands for.
+    /// An unknown escape is reported on std::cerr and returned unchanged.
+    char decode_escape(char escaped) {
+        switch (escaped) {
+            case 'n': return '\n';
+            case 't': return '\t';
+            case 'r': return '\r';
+            case 'b': return '\b';
+            case 'f': return '\f';
+            case 'a': return '\a';
+            case 'v': return '\v';
+            case '0': return '\0';
+            case '\\':
+            case '\'':
+            case '\"':
+                return escaped;
+            default:
+                error_prefix() << "unknown escape char " << escaped;
+                return escaped;
+        }
+    }
+
+    /// Scans a double-quoted string literal and returns its contents with
+    /// escape sequences decoded.
+    ///
+    /// Assumes guard is on the opening quote. A raw newline, an unknown escape
+    /// and a missing closing quote are reported on std::cerr; scanning carries
+    /// on with what was read.
+    std::string string_advance() {
+        guard_advance(); // opening quote
+
+        std::stringstream buf;
+        while (!at_eof() && peek() != '"') {
+            char c = peek();
+
+            if (c == '\n') {
+                error_prefix() << "string cannot newline.";
+            }
+
+            if (c == '\\') {
+                guard_advance(); // the backslash; the escaped character follows
+                if (at_eof()) {
+                    break; // lone backslash at end of input, reported below
+                }
+                c = decode_escape(peek());
+            }
+
+            buf << c;
+            guard_advance();
+        }
+
+        if (at_eof()) {
+            error_prefix() << "unterminated string literal";
+        } else {
+            guard_advance(); // closing quote
+        }
+
+        return buf.str();
+    }
+
+    /// Converts an integer lexeme written in the given base.
+    ///
+    /// @return the parsed value, or 0 after a report on std::cerr when word
+    ///         has no digits for that base or does not fit in a long.
+    long number_convert(const std::string& word, int base) {
+        try {
+            return std::stol(word, nullptr, base);
+        } catch (const std::invalid_argument&) {
+            error_prefix() << "Invalid number: " << word << std::endl;
+            return 0;
+        } catch (const std::out_of_range&) {
+            error_prefix() << "Number out of range: " << word << std::endl;
+            return 0;
+        }
+    }
+
+    /// Consumes a two-character radix prefix plus the digits after it and
+    /// returns the digits only, the form std::stol accepts for every base.
+    /// Assumes guard is on the '0' of the prefix.
+    template <typename Pred>
+    std::string radix_digits_advance(Pred is_digit) {
+        // Skip the prefix through guard_advance() so length and column stay in
+        // step with guard.
+        guard_advance();
+        guard_advance();
+
+        const std::string::size_type digits_begin = cursor().guard;
+        advance_while(is_digit);
+        return cursor().source.substr(digits_begin, cursor().guard - digits_begin);
+    }
+
+    /// Digits of a hexadecimal literal; the "0x" prefix is skipped.
+    std::string hex_number_advance() {
+        return radix_digits_advance([this](char c) { return is_hex_number(c); });
+    }
+
+    /// Digits of an octal literal; the "0o" prefix is skipped.
+    std::string oct_number_advance() {
+        return radix_digits_advance([this](char c) { return is_oct_number(c); });
+    }
+
+    /// Digits of a binary literal; the "0b" prefix is skipped.
+    std::string bin_number_advance() {
+        return radix_digits_advance([this](char c) { return is_bin_number(c); });
+    }
+
+    /// Consumes a decimal literal and returns the lexeme: digits, an optional
+    /// fraction (".digits") and an optional exponent ('e' or 'E', optional sign,
+    /// digits).
+    std::string number_advance() {
+        const auto digit = [this](char c) { return is_number(c); };
+        advance_while(digit);
+
+        // Fraction: the '.' only belongs to the number when a digit follows it.
+        if (peek() == '.' && is_number(peek(1))) {
+            guard_advance();
+            advance_while(digit);
+        }
+
+        // Exponent: the marker must be followed by a digit or a sign.
+        const char after_marker = peek(1);
+        if ((peek() == 'e' || peek() == 'E')
+            && (is_number(after_marker) || after_marker == '+' || after_marker == '-')) {
+            guard_advance(); // e or E
+            if (peek() == '+' || peek() == '-') {
+                guard_advance(); // sign
+            }
+            advance_while(digit);
+        }
+
+        return gen_word();
+    }
+
+    /// Classifies an identifier-shaped word as a keyword, a built-in type name
+    /// or a plain identifier (TOKEN_IDENT). The whole word must match.
+    ///
+    /// Only spellings whose token exists in token_type_t are listed. float, go,
+    /// is, int, import, interface, let, null, ptr, set, select, throw, try,
+    /// void, uint and match have no enumerator yet, so they scan as TOKEN_IDENT
+    /// until one is added together with a row here.
+    ///
+    /// @param word   the complete word to classify.
+    /// @param length not read (word.size() is authoritative); kept for signature
+    ///               compatibility.
+    static token_type_t scanner_ident(const std::string& word, int length) {
+        (void) length;
+
+        // NOTE(review): TOKEN_ARR is spelled "array" here but "arr" in token_str;
+        // confirm which spelling the language wants.
+        static const std::unordered_map<std::string, token_type_t> keywords = {
+            {"array", TOKEN_ARR}, {"bool", TOKEN_BOOL}, {"break", TOKEN_BREAK},
+            {"continue", TOKEN_CONTINUE}, {"else", TOKEN_ELSE}, {"false", TOKEN_FALSE},
+            {"fn", TOKEN_FN}, {"for", TOKEN_FOR}, {"if", TOKEN_IF}, {"in", TOKEN_IN},
+            {"map", TOKEN_MAP}, {"return", TOKEN_RETURN}, {"string", TOKEN_STRING},
+            {"struct", TOKEN_STRUCT}, {"true", TOKEN_TRUE}, {"tup", TOKEN_TUP},
+            {"type", TOKEN_TYPE}, {"var", TOKEN_VAR}, {"vec", TOKEN_VEC},
+            {"f32", TOKEN_F32}, {"f64", TOKEN_F64},
+            {"i8", TOKEN_I8}, {"i16", TOKEN_I16}, {"i32", TOKEN_I32}, {"i64", TOKEN_I64},
+            {"u8", TOKEN_U8}, {"u16", TOKEN_U16}, {"u32", TOKEN_U32}, {"u64", TOKEN_U64},
+        };
+
+        const auto found = keywords.find(word);
+        return found == keywords.end() ? TOKEN_IDENT : found->second;
+    }
+
+    /// Scans the token that starts at current.
+    ///
+    /// Only identifiers and keywords are dispatched so far; end of input yields
+    /// TOKEN_EOF and anything else TOKEN_NOT_IN_THIS_TYPE with an empty literal.
+    /// NOTE(review): numbers, strings, operators and whitespace still have to be
+    /// routed to the helpers above, and current has to be moved up to guard
+    /// after each token (here or in the caller) for the scan to make progress.
+    token_t item() {
+        cursor().length = 0;               // start a fresh lexeme
+        cursor().guard = cursor().current; // rewind to where the lexeme starts
+
+        // Capture the position now: advancing moves line and column.
+        const int start_line = cursor().line;
+        const int start_column = cursor().column;
+
+        if (is_alpha(peek())) {
+            const std::string word = ident_advance();
+            return token_t(scanner_ident(word, static_cast<int>(word.size())), word,
+                           start_line, start_column);
+        }
+
+        return token_t(at_eof() ? TOKEN_EOF : TOKEN_NOT_IN_THIS_TYPE, "",
+                       start_line, start_column);
+    }
+};
\ No newline at end of file
diff --git a/include/syntax/token.h b/include/syntax/token.h
new file mode 100644
index 0000000..c0b1561
--- /dev/null
+++ b/include/syntax/token.h
@@ -0,0 +1,201 @@
+#pragma once
+#include <iostream>
+#include <string>
+#include <unordered_map>
+
+#define DEBUG_SCANNER
+
+enum token_type_t {
+    TOKEN_NOT_IN_THIS_TYPE = 0,
+    TOKEN_LEFT_PAREN,
+    TOKEN_RIGHT_PAREN,// ()
+    TOKEN_LEFT_SQUARE,
+    TOKEN_RIGHT_SQUARE,// []
+    TOKEN_LEFT_CURLY,
+    TOKEN_RIGHT_CURLY,// {}
+    TOKEN_LEFT_ANGLE, // <
+    TOKEN_LESS_THAN,  // <
+    TOKEN_RIGHT_ANGLE,// >
+
+    TOKEN_COMMA,      // ,
+    TOKEN_DOT,        // .
+    TOKEN_MINUS,      // -
+    TOKEN_PLUS,       // +
+    TOKEN_COLON,      // :
+    TOKEN_SEMICOLON,  // ;
+    TOKEN_SLASH,      // /
+    TOKEN_STAR,       // a * b, *a
+    TOKEN_PERSON,     // %
+    TOKEN_QUESTION,   // ?
+    TOKEN_RIGHT_ARROW,// ->
+
+    TOKEN_NOT,// !
+    TOKEN_NOT_EQUAL,
+    TOKEN_EQUAL,
+    TOKEN_EQUAL_EQUAL,
+    TOKEN_GREATER_EQUAL,// >=
+    TOKEN_LESS_EQUAL,   // <=
+    TOKEN_AND_AND,      // &&
+    TOKEN_OR_OR,        // ||
+
+    TOKEN_PLUS_EQUAL,       // +=
+    TOKEN_MINUS_EQUAL,      // -=
+    TOKEN_STAR_EQUAL,       // *=
+    TOKEN_SLASH_EQUAL,      // /=
+    TOKEN_PERSON_EQUAL,     // %=
+    TOKEN_AND_EQUAL,        // &=
+    TOKEN_OR_EQUAL,         // |=
+    TOKEN_XOR_EQUAL,        // ^=
+    TOKEN_LEFT_SHIFT_EQUAL, // <<=
+    TOKEN_RIGHT_SHIFT_EQUAL,// >>=
+
+    // bitwise operators
+    TOKEN_TILDE,      // ~
+    TOKEN_AND,        // &
+    TOKEN_OR,         // |
+    TOKEN_XOR,        // ^
+    TOKEN_LEFT_SHIFT, // <<
+    TOKEN_RIGHT_SHIFT,// >>
+
+    // literals
+    TOKEN_IDENT, // identifier
+    TOKEN_LITERAL_STRING,
+    TOKEN_LITERAL_FLOAT,
+    TOKEN_LITERAL_INT,
+
+    // types
+    TOKEN_STRING,
+    TOKEN_BOOL,
+    TOKEN_U8,
+    TOKEN_U16,
+    TOKEN_U32,
+    TOKEN_U64,
+    TOKEN_I8,
+    TOKEN_I16,
+    TOKEN_I32,
+    TOKEN_I64,
+    TOKEN_F32,
+    TOKEN_F64,
+
+    // built-in compound types
+    TOKEN_ARR,
+    TOKEN_VEC,
+    TOKEN_MAP,
+    TOKEN_TUP,
+
+    // keywords
+    TOKEN_VAR,
+    TOKEN_TRUE,
+    TOKEN_FALSE,
+    TOKEN_TYPE,
+    TOKEN_STRUCT,
+    TOKEN_CONTINUE,
+    TOKEN_BREAK,
+    TOKEN_FOR,
+    TOKEN_IN,
+    TOKEN_IF,
+    TOKEN_ELSE,
+    TOKEN_ELSE_IF,
+    TOKEN_FN,
+    TOKEN_RETURN,
+    TOKEN_STMT_EOF, // ;
+    TOKEN_EOF,// TOKEN_EOF must stay the last enumerator, otherwise an index overflows
+};
+// Printable spelling of each token type; read by the DEBUG_SCANNER trace in token_t.
+inline std::unordered_map<token_type_t, std::string> token_str = {
+    {TOKEN_LEFT_PAREN, "("},
+    {TOKEN_RIGHT_PAREN, ")"},
+    {TOKEN_LEFT_SQUARE, "["},
+    {TOKEN_RIGHT_SQUARE, "]"},
+    {TOKEN_LEFT_CURLY, "{"},
+    {TOKEN_RIGHT_CURLY, "}"},
+    {TOKEN_LEFT_ANGLE, "<"},
+    {TOKEN_LESS_THAN, "<"},
+    {TOKEN_RIGHT_ANGLE, ">"},
+    {TOKEN_COMMA, ","},
+    {TOKEN_DOT, "."},
+    {TOKEN_MINUS, "-"},
+    {TOKEN_PLUS, "+"},
+    {TOKEN_COLON, ":"},
+    {TOKEN_SEMICOLON, ";"},
+    {TOKEN_SLASH, "/"},
+    {TOKEN_STAR, "*"},
+    {TOKEN_PERSON, "%"},
+    {TOKEN_QUESTION, "?"},
+    {TOKEN_RIGHT_ARROW, "->"},
+    {TOKEN_NOT, "!"},
+    {TOKEN_NOT_EQUAL, "!="},
+    {TOKEN_EQUAL, "="},
+    {TOKEN_EQUAL_EQUAL, "=="},
+    {TOKEN_GREATER_EQUAL, ">="},
+    {TOKEN_LESS_EQUAL, "<="},
+    {TOKEN_AND_AND, "&&"},
+    {TOKEN_OR_OR, "||"},
+    {TOKEN_PLUS_EQUAL, "+="},
+    {TOKEN_MINUS_EQUAL, "-="},
+    {TOKEN_STAR_EQUAL, "*="},
+    {TOKEN_SLASH_EQUAL, "/="},
+    {TOKEN_PERSON_EQUAL, "%="},
+    {TOKEN_AND_EQUAL, "&="},
+    {TOKEN_OR_EQUAL, "|="},
+    {TOKEN_XOR_EQUAL, "^="},
+    {TOKEN_LEFT_SHIFT_EQUAL, "<<="},
+    {TOKEN_RIGHT_SHIFT_EQUAL, ">>="},
+    {TOKEN_TILDE, "~"},
+    {TOKEN_AND, "&"},
+    {TOKEN_OR, "|"},
+    {TOKEN_XOR, "^"},
+    {TOKEN_LEFT_SHIFT, "<<"},
+    {TOKEN_RIGHT_SHIFT, ">>"},
+    {TOKEN_IDENT, "ident_literal"},
+    {TOKEN_LITERAL_STRING, "string_literal"},
+    {TOKEN_LITERAL_FLOAT, "float_literal"},
+    {TOKEN_LITERAL_INT, "int_literal"},
+    {TOKEN_STRING, "string"},
+    {TOKEN_BOOL, "bool"},
+    {TOKEN_U8, "u8"},
+    {TOKEN_U16, "u16"},
+    {TOKEN_U32, "u32"},
+    {TOKEN_U64, "u64"},
+    {TOKEN_I8, "i8"},
+    {TOKEN_I16, "i16"},
+    {TOKEN_I32, "i32"},
+    {TOKEN_I64, "i64"},
+    {TOKEN_F32, "f32"},
+    {TOKEN_F64, "f64"},
+    {TOKEN_ARR, "arr"},
+    {TOKEN_VEC, "vec"},
+    {TOKEN_MAP, "map"},
+    {TOKEN_TUP, "tup"},
+    {TOKEN_VAR, "var"},
+    {TOKEN_TRUE, "true"},
+    {TOKEN_FALSE, "false"},
+    {TOKEN_TYPE, "type"},
+    {TOKEN_STRUCT, "struct"},
+    {TOKEN_CONTINUE, "continue"},
+    {TOKEN_BREAK, "break"},
+    {TOKEN_FOR, "for"},
+    {TOKEN_IN, "in"},
+    {TOKEN_IF, "if"},
+    {TOKEN_ELSE, "else"},
+    {TOKEN_ELSE_IF, "else if"},
+    {TOKEN_FN, "fn"},
+    {TOKEN_RETURN, "return"},
+    {TOKEN_STMT_EOF, ";"},
+    {TOKEN_EOF, "\0"}
+};
+// One scanned token: its type, its text, and the position it was created with.
+struct token_t {
+    token_type_t type;
+    std::string literal;
+    int line;
+    int column;
+    int length; // number of characters in literal
+    // While DEBUG_SCANNER is defined, every constructed token is traced on std::cout.
+    token_t(token_type_t token_type, std::string literal, int line, int column)
+        : type(token_type), literal(literal), line(line), column(column), length(static_cast<int>(literal.size())) {
+#ifdef DEBUG_SCANNER
+        std::cout << "[DEBUG] SCANNER line: " << line << ", type: " << token_str[token_type] << ", literal: " << literal << std::endl;
+#endif
+    }
+};
\ No newline at end of file
diff --git a/include/types.hpp b/include/types.hpp
new file mode 100644
index 0000000..185ac10
--- /dev/null
+++ b/include/types.hpp
@@ -0,0 +1,31 @@
+#pragma once
+#include <string>
+#include <vector>
+#include "syntax/token.h"
+// Scanner position inside one module's source text.
+struct scanner_cursor_t {
+
std::string source; // text being scanned
+    std::string::size_type current = 0; // index where the lexeme in progress starts
+    std::string::size_type guard = 0;   // index of the next unread character
+    int length = 0;                     // characters consumed for the lexeme in progress
+
+    int line = 1;   // line the scanner is currently on
+    int column = 1; // column the scanner is currently on
+};
+// One source module: its text, the scanner cursor over it, and its token list.
+struct module_t {
+    std::string source;
+
+    scanner_cursor_t s_cursor;
+    std::vector<token_t> token_list;
+    // Copies source into the cursor as well and puts the cursor at line 1, column 1, index 0.
+    module_t(std::string source)
+        : source(source) {
+        s_cursor.source = source;
+        s_cursor.line = 1;
+        s_cursor.column = 1;
+        s_cursor.length = 0;
+        s_cursor.current = 0;
+        s_cursor.guard = 0;
+    }
+};
\ No newline at end of file
diff --git a/src/Scanner.cpp b/src/Scanner.cpp deleted file mode 100644 index 7387059..0000000 --- a/src/Scanner.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include "Scanner.h" -#include - -int Scanner::process_const_table(int index) { - return 0; -} - -int Scanner::process_identifier_table(int index) { - std::stringstream buffer; - int old_index = index; - if (std::isalpha(m_source_code[index])) { - buffer << m_source_code[index]; - index += 1; - while(std::isalnum(m_source_code[index])) { - buffer << m_source_code[index]; - index += 1; - } - - std::string identifier = buffer.str(); - for (const auto& key : m_tables.KeyTable) { - if (identifier == key.second) { - return 0; - } - } - - m_tables.IdTable.insert({identifier_index, identifier}); - m_token_list.push_back(Token{identifier_index, ID_TABLE}); - identifier_index++; - return index - old_index; - } else { - return 0; - } - -} - -int Scanner::process_key_table(int index) { - return 0; -} - -int Scanner::process_punct_table(int index) { - //identify the Punct in map - string s; - int n = this->m_source_code.size(); - char c1= this->m_source_code[index]; - char c2 = '@'; - char c3 = '@'; - if (index + 1 < n) c2 = this->m_source_code[index + 1]; - if (index + 2 < n) c3 = this->m_source_code[index + 2]; - if (((c1 == c2 && c2 == '<') || (c1 == c2&& c2 == '>'))&& c3 =='=') { - this->m_token_list.push_back({c1=='<'?31:32,PUNCT_TABLE}); - return 3; - } - if (c1 == c2 ) { - if (c1 =='=') { - this->m_token_list.push_back({15,PUNCT_TABLE}); -
return 2; - } - if (c1 =='|') { - this->m_token_list.push_back({21,PUNCT_TABLE}); - return 2; - } - if (c1=='&') { - this->m_token_list.push_back({20,PUNCT_TABLE}); - return 2; - } - if (c1=='<') { - this->m_token_list.push_back({9,PUNCT_TABLE}); - return 2; - } - if (c1=='>') { - this->m_token_list.push_back({10,PUNCT_TABLE}); - return 2; - } - } - - string t; - t.push_back(c1); - t.push_back(c2); - for (auto e : this->m_tables.PunctTable) { - if (e.second == t) { - this->m_token_list.push_back({e.first,PUNCT_TABLE}); - return 2; - } - } - - t.pop_back(); - for (auto e : this->m_tables.PunctTable) { - if (e.second == t) { - this->m_token_list.push_back({e.first,PUNCT_TABLE}); - return 1; - } - } - return 0; -} diff --git a/src/Tbs.cpp b/src/Tbs.cpp deleted file mode 100644 index 728e99a..0000000 --- a/src/Tbs.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "Tbs.h"