diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b92779..e593a59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,9 +11,7 @@ include_directories(${CMAKE_SOURCE_DIR}/include) file(GLOB SOURCES ${CMAKE_SOURCE_DIR}/src/*.cpp) -add_executable(Hydrogen ${SOURCES} - src/scanner.cpp - src/Tbs.cpp) +add_executable(Hydrogen ${SOURCES}) file(GLOB TEST_SOURCES ${CMAKE_SOURCE_DIR}/unit/*.cpp) diff --git a/include/Scanner.h b/include/Scanner.h deleted file mode 100644 index 3e88f48..0000000 --- a/include/Scanner.h +++ /dev/null @@ -1,58 +0,0 @@ -# pragma once -#include "stdc++.h" -#include "Token.h" -#include "Tbs.h" -#include -#include -#include - -class Scanner { -public: - Scanner(std::string source_code, Tbs tables) - : m_source_code(source_code), m_tables(tables) {} - - - void scan() { - int len = 0; - for (int i = 0; i < m_source_code.size(); i++) { - if (len = process_const_table(i)) { - i += len - 1; - len = 0; - } else if (len = process_identifier_table(i)) { - i += len - 1; - len = 0; - } else if (len = process_key_table(i)) { - i += len - 1; - len = 0; - } else if (len = process_punct_table(i)) { - i += len - 1; - len = 0; - } else if (m_source_code[i] == ' ' || m_source_code[i] == '\t' || m_source_code[i] == '\n') { - continue; - } else { - std::cerr << "Error: Tokenize" << std::endl; - exit(0); - } - - } - } - - inline std::vector get_token_list() { - return m_token_list; - } - - int process_const_table(int i); - int process_identifier_table(int i); - int process_key_table(int i); - int process_punct_table(int i); - - -private: - std::string m_source_code; - std::vector m_token_list; - Tbs m_tables; - int index; - - // 记录标识符表的索引 - int identifier_index = 0; -}; \ No newline at end of file diff --git a/include/Tbs.h b/include/Tbs.h deleted file mode 100644 index ffaeb3a..0000000 --- a/include/Tbs.h +++ /dev/null @@ -1,91 +0,0 @@ -# pragma once -#include "stdc++.h" -#include - -using std::unordered_map,std::string; -class Tbs { -public: - unordered_map ConstTable; - unordered_map IdTable; - std::unordered_map KeyTable = { - {1, "var"}, - {2, "i8"}, - {3, "i16"}, - {4, "i32"}, - {5, "i64"}, - {6, "u8"}, - {7, "u16"}, - {8, "u32"}, - {9, "u64"}, - {10, "float32"}, - {11, "float64"}, - {12, "char"}, - {13, "for"}, - {14, "if"}, - {15, "else"}, - {16, "bool"}, - {17, "string"}, - {18, "vector"}, - {19, "array"}, - {20, "struct"}, - {21, "tuple"}, - {22, "print"}, - {23, "println"} - }; - - std::unordered_map PunctTable = { - {1, "-"}, - {2, "!"}, - {3, "~"}, - {4, "/"}, - {5, "*"}, - {6, "%"}, - {7, "+"}, - {8, "-"}, - {9, "<<"}, - {10, ">>"}, - {11, ">"}, - {12, ">="}, - {13, "<"}, - {14, "<="}, - {15, "=="}, - {16, "!="}, - {17, "&"}, - {18, "^"}, - {19, "|"}, - {20, "&&"}, - {21, "||"}, - {22, "="}, - {23, "%="}, - {24, "*="}, - {25, "/="}, - {26, "+="}, - {27, "-="}, - {28, "|="}, - {29, "&="}, - {30, "^="}, - {31, "<<="}, - {32, ">>="}, - {33, "("}, - {34, ")"}, - {35, "<"}, - {36, ">"}, - {37, ","}, - {38, "."}, - {39, "["}, - {40, "]"}, - {41, "?"}, - {42, ":"}, - {43, "->"}, - {44,";"} - }; - -}; - -enum Table_Type { - CONST_TABLE, - ID_TABLE, - KEY_TABLE, - PUNCT_TABLE -}; - diff --git a/include/Token.h b/include/Token.h deleted file mode 100644 index f5f55ff..0000000 --- a/include/Token.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once -#include "stdc++.h" -#include "Tbs.h" - -struct Token{ - int id; - Table_Type type; -}; \ No newline at end of file diff --git a/include/input.txt b/include/input.txt new file mode 100644 index 0000000..eb79b32 --- /dev/null +++ b/include/input.txt @@ -0,0 +1,52 @@ +//Merge Sort + +struct Point { + x:i8; + y:i8; +} +// Struct + +/* +ababa +*/ +[Point:105] tmp; + +fn MergeSort([Point:20] v,i8 l,i8 r) -> { + if l>r { + return ; + } + var mid = l + r >>1; + MergeSort(v,l,mid); + MergeSort(v,mid+1,r); + i8 i=l,j=mid+1,k=l; + for ;i <= mid && j <= r;k+=1 { + if v[l] < v[r] + { + tmp[k] = v[l]; + l +=1; + } + else{ + tmp[k] = v[r]; + r += 1; + } + } + for ; i<=mid; { + tmp[k] = tmp[i]; + k += 1,i+=1; + } + for ; j <=r ; { + tmp[k] = tmp[j]; + k +=1 , j += 1; + } + +} +fn main()->i8{ + [Point:20] d; + [Point] d; + + for i8 i = 0;i< 20; i++ { + d[i] = {x:i * i,y:i}; + } + MergeSort(d,d+20); + 0 +} \ No newline at end of file diff --git a/include/syntax/Scanner.h b/include/syntax/Scanner.h new file mode 100644 index 0000000..e369c46 --- /dev/null +++ b/include/syntax/Scanner.h @@ -0,0 +1,696 @@ +#pragma once +#include "doctest.h" +#include "token.h" +#include "../types.hpp" +#include +#include +#include +#include + +class Scanner { +public: + Scanner(module_t& module) + : module(module) {} + + inline std::vector scan() { + std::vector tokens; + + while (!at_eof()) { + + if (skip_space()) { + // 如果是空格或换行,则跳过 + continue; + } + + token_t token = item(); + tokens.push_back(token); + } + + tokens.push_back(token_t(TOKEN_EOF, "EOF", module.s_cursor.line, module.s_cursor.column)); + return tokens; + } + + +private: + module_t& module; + + inline std::string gen_word() { + return module.s_cursor.source.substr(module.s_cursor.current, module.s_cursor.length); + } + + inline bool is_space(char c) { + if (c == '\n' || c == '\t' || c == '\r' || c == ' ') { + return true; + } + return false; + } + + + inline bool is_string(char s) { + return s == '"'; + } + + inline bool is_float(std::string word) { + // 是否包含 .,包含则为 float + int dot_count = 0; + bool has_e = false; + + for (std::string::size_type i = 0; i < word.size(); i++) { + if (word[i] == '.') + dot_count++; + else if (word[i] == 'e' || word[i] == 'E') + has_e = true; + } + + // 结尾不能是 . + if (word[-1] == '.') { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". floating-point numbers cannot end with '.'"; + return false; + } + + // 如果有科学计数法标记,则认为是浮点数 + if (has_e) { + return true; + } + + if (dot_count == 0) { + return false; + } + + if (dot_count > 1) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". floating-point numbers have multiple '.'"; + return false; + } + + return true; + } + + inline bool is_alpha(char c) { + return std::isalpha(c); + } + + inline bool is_number(char c) { + return std::isdigit(c); + } + + inline bool is_hex_number(char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + } + + inline bool is_oct_number(char c) { + return c >= '0' && c <= '7'; + } + + inline bool is_bin_number(char c) { + return c == '0' || c == '1'; + } + + inline bool at_eof() { + return module.s_cursor.source[module.s_cursor.guard] == '\0'; + } + + inline char guard_advance() { + module.s_cursor.guard++; + module.s_cursor.length++; + module.s_cursor.column++; + + if (module.s_cursor.source[module.s_cursor.guard] == '\n') { + module.s_cursor.line++; + module.s_cursor.column = 0; + } + + return module.s_cursor.source[module.s_cursor.guard]; + } + + inline bool match(char expected) { + if (at_eof()) + return false; + + if (module.source[module.s_cursor.guard] != expected) + return false; + + guard_advance(); + return true; + } + + inline std::string ident_advance() { + while((is_alpha(module.s_cursor.source[module.s_cursor.guard]) || + is_number(module.s_cursor.source[module.s_cursor.guard])) && + !at_eof()) { + guard_advance(); + + } + + return gen_word(); + } + + inline token_type_t special_char() { + char c = module.s_cursor.source[module.s_cursor.guard]; + guard_advance(); + switch (c) { + case '(': + return TOKEN_LEFT_PAREN; + case ')': + return TOKEN_RIGHT_PAREN; + case '[': + return TOKEN_LEFT_SQUARE; + case ']': + return TOKEN_RIGHT_SQUARE; + case '{': + return TOKEN_LEFT_CURLY; + case '}': + return TOKEN_RIGHT_CURLY; + case ':': + return TOKEN_COLON; + case ';': + return TOKEN_STMT_EOF; + case ',': + return TOKEN_COMMA; + case '?': + return TOKEN_QUESTION; + case '%': + return match('=') ? TOKEN_PERSON_EQUAL : TOKEN_PERSON; + case '-': + if (match('=')) { + return TOKEN_MINUS_EQUAL; + } + if (match('>')) { + return TOKEN_RIGHT_ARROW; + } + + return TOKEN_MINUS; + case '+': + return match('=') ? TOKEN_PLUS_EQUAL : TOKEN_PLUS; + case '/': + return match('=') ? TOKEN_SLASH_EQUAL : TOKEN_SLASH; + case '*': { + return match('=') ? TOKEN_STAR_EQUAL : TOKEN_STAR; + } + case '.': { + return TOKEN_DOT; + } + case '!': + return match('=') ? TOKEN_NOT_EQUAL : TOKEN_NOT; + case '=': + return match('=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL; + case '<': + if (match('<')) { + // << + if (match('=')) { + // <<= + return TOKEN_LEFT_SHIFT_EQUAL; + } + // << + return TOKEN_LEFT_SHIFT; + } else if (match('=')) { + return TOKEN_LESS_EQUAL; + } + return TOKEN_LEFT_ANGLE; + case '>': { + if (match('=')) { + // >= + return TOKEN_GREATER_EQUAL; + } + if (match('>') && match('=')) { + return TOKEN_RIGHT_SHIFT_EQUAL; + } + + return TOKEN_RIGHT_ANGLE; // > + } + case '&': + return match('&') ? TOKEN_AND_AND : TOKEN_AND; + case '|': + return match('|') ? TOKEN_OR_OR : TOKEN_OR; + case '~': + return TOKEN_TILDE; + case '^': + return match('=') ? TOKEN_XOR_EQUAL : TOKEN_XOR; + default: + return token_type_t::TOKEN_NOT_IN_THIS_TYPE; + } + + } + + inline std::string string_advance() { + module.s_cursor.guard++; + char escape_char = '\\'; + + std::stringstream buf; + + while (module.s_cursor.source[module.s_cursor.guard] != '\"' && !at_eof()) { + char guard = module.s_cursor.source[module.s_cursor.guard]; + + if (guard == '\n') { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". string cannot newline."; + } + + // 处理转义字符 + if (guard == escape_char) { + // 跳过转义字符第一个 + module.s_cursor.guard++; + + guard = module.s_cursor.source[module.s_cursor.guard]; + + switch (guard) { + case 'n': + guard = '\n'; + break; + case 't': + guard = '\t'; + break; + case 'r': + guard = '\r'; + break; + case 'b': + guard = '\b'; + break; + case 'f': + guard = '\f'; + break; + case 'a': + guard = '\a'; + break; + case 'v': + guard = '\v'; + break; + case '0': + guard = '\0'; + break; + case '\\': + case '\'': + case '\"': + break; + default: + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". unknown escape char " << guard; + } + } + + buf << guard; + guard_advance(); + } + + //跳过close char + module.s_cursor.guard++; + + return buf.str(); + } + + inline long number_convert(std::string word, int base) { + try { + long decimal = std::stol(word, 0, base); + return decimal; + } catch (const std::invalid_argument& e) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". Invalid number: " << word << std::endl; + return 0; + } catch (const std::out_of_range& e) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". Number out of range: " << word << std::endl; + return 0; + } + } + + inline double number_convert_float(std::string word) { + try { + double decimal = std::stod(word); + return decimal; + } catch (const std::invalid_argument& e) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". Invalid number: " << word << std::endl; + return 0; + } catch (const std::out_of_range& e) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". Number out of range: " << word << std::endl; + return 0; + } + } + + inline std::string hex_number_advance() { + module.s_cursor.guard += 2; // 跳过 0x + + while (is_hex_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) { + guard_advance(); + } + + return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length); + } + + inline std::string oct_number_advance() { + module.s_cursor.guard += 2; // 跳过 0o + + while (is_oct_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) { + guard_advance(); + } + + return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length); + } + + inline std::string bin_number_advance() { + module.s_cursor.guard += 2; // 跳过 0b + + while (is_bin_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) { + guard_advance(); + } + + return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length); + } + + inline std::string number_advance() { + while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) { + guard_advance(); + } + + // 处理小数部分 + if (module.s_cursor.source[module.s_cursor.guard] == '.' && is_number(module.s_cursor.source[module.s_cursor.guard + 1])) { + guard_advance(); // 跳过小数点 + while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) { + guard_advance(); + } + } + + // 处理科学计数法 + if ((module.s_cursor.source[module.s_cursor.guard] == 'e' || module.s_cursor.source[module.s_cursor.guard] == 'E') + && (is_number(module.s_cursor.source[module.s_cursor.guard + 1]) || + module.s_cursor.source[module.s_cursor.guard + 1] == '+' || + module.s_cursor.source[module.s_cursor.guard + 1] == '-')) { + guard_advance(); // 跳过 e 或 E + if (module.s_cursor.source[module.s_cursor.guard] == '+' || module.s_cursor.source[module.s_cursor.guard] == '-') { + guard_advance(); // 跳过符号 + } + while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) { + guard_advance(); + } + } + + return gen_word(); + } + + static token_type_t scanner_ident(std::string word, int length) { + switch (word[0]) { + case 'a': { + switch (word[1]) { + case 'r': { + if (word.substr(2, 3) == "ray" && word.size() == 5) { + return TOKEN_ARR; + } + } + } + break; + } + case 'b': { + switch (word[1]) { + case 'o': + if (word.substr(2, 2) == "ol" && word.size() == 4) { + return TOKEN_BOOL; + } + case 'r': + if (word.substr(2, 3) == "eak" && word.size() == 5) { + return TOKEN_BREAK; + } + } + break; + } + case 'c': { + switch (word[1]) { + case 'o': + if (word.substr(2, 6) == "ntinue" && word.size() == 8) { + return TOKEN_CONTINUE; + } + } + break; + } + case 'e': { + if (word.substr(1, 3) == "lse" && word.size() == 4) { + return TOKEN_ELSE; + } + } + case 'f': { + switch (word[1]) { + case 'n': + if (word.size() == 2) { + return TOKEN_FN; + } + case 'a': + if (word.substr(2, 3) == "lse" && word.size() == 5) { + return TOKEN_FALSE; + } + case '3': + if (word.substr(2, 1) == "2" && word.size() == 3) { + return TOKEN_F32; + } + case '6': + if (word.substr(2, 1) == "4" && word.size() == 3) { + return TOKEN_F64; + } + case 'o': + if (word.substr(2, 1) == "r" && word.size() == 3) { + return TOKEN_FOR; + } + } + break; + } + case 'i': { + switch (word[1]) { + case 'f': + if (word.size() == 2) { + return TOKEN_IF; + } + case '8': + if (word.size() == 2) { + return TOKEN_I8; + } + case '1': + if (word.substr(2, 1) == "6" && word.size() == 3) { + return TOKEN_I16; + } + case '3': + if (word.substr(2, 1) == "2" && word.size() == 3) { + return TOKEN_I32; + } + case '6': + if (word.substr(2, 1) == "4" && word.size() == 3) { + return TOKEN_I64; + } + } + break; + } + case 's': { + // self,string,struct,sizeof,sett + + if (length == 6 && word[1] == 't' && word[2] == 'r') { + switch (word[3]) { + case 'i': + if (word.substr(4, 2) == "ng" && word.size() == 6) { + return TOKEN_STRING; + } + case 'u': + if (word.substr(4, 2) == "ct" && word.size() == 6) { + return TOKEN_STRUCT; + } + } + } + break; + } + case 't': { + // tup/throw/type/true + switch (word[1]) { + case 'y': // type + if (word.substr(2, 2) == "pe" && word.size() == 4) { + return TOKEN_TYPE; + } + case 'u': // tup + if (word.substr(2, 1) == "p" && word.size() == 3) { + return TOKEN_TUP; + } + case 'r': { + switch (word[2]) { + case 'u': + if (word.substr(3, 1) == "e" && word.size() == 4) { + return TOKEN_TRUE; + } + } + break; + } + } + break; + } + case 'v': { + switch (word[1]) { + case 'a': + if (word.substr(2, 1) == "r" && word.size() == 3) { + return TOKEN_VAR; + } + case 'e': // vec + if (word.substr(2, 1) == "c" && word.size() == 3) { + return TOKEN_VEC; + } + } + } + case 'u': { + switch (word[1]) { + case '8': + if (word.size() == 2) { + return TOKEN_U8; + } + case '1': + if (word.substr(2, 1) == "6" && word.size() == 3) { + return TOKEN_U16; + } + case '3': + if (word.substr(2, 1) == "2" && word.size() == 3) { + return TOKEN_U32; + } + case '6': + if (word.substr(2, 1) == "4" && word.size() == 3) { + return TOKEN_U64; + } + } + break; + } + case 'm': { + // map + switch (word[1]) { + case 'a': { + switch (word[2]) { + case 'p': + if (word.size() == 3) { + return TOKEN_MAP; + } + } + } + } + } + case 'r': { + if (word.substr(1, 5) == "eturn" && word.size() == 6) { + // return + return TOKEN_RETURN; + } + } + } + + return TOKEN_IDENT; + } + + inline bool multi_comment_end() { + return module.s_cursor.source[module.s_cursor.guard] == '*' && + module.s_cursor.source[module.s_cursor.guard + 1] == '/'; + } + + inline bool skip_space() { + bool has_new = false; + + if (module.s_cursor.guard != module.s_cursor.current) { + module.s_cursor.space_prev = module.s_cursor.source[module.s_cursor.guard - 1]; + } + + while (true) { + char c = module.s_cursor.source[module.s_cursor.guard]; + switch (c) { + case ' ': + case '\r': + case '\t': { + guard_advance(); + break; + } + case '\n': { + guard_advance(); + has_new = true; + break; + } + case '/': { + if (module.s_cursor.source[module.s_cursor.guard + 1] == '/') { + // 单行注释 + while (module.s_cursor.source[module.s_cursor.guard] != '\n' && !at_eof()) { + guard_advance(); + } + break; + } else if (module.s_cursor.source[module.s_cursor.guard + 1] == '*') { + while (!multi_comment_end()) { + if (at_eof()) { + std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " + << module.s_cursor.column <<". multi comment not end."; + return false; + } + guard_advance(); + } + + guard_advance(); // 跳过 * + guard_advance(); // 跳过 / + break; + } else { + module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard]; + return has_new; + } + break; + } + default: { + module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard]; + return has_new; + } + } + } + + } + + inline token_t item() { + module.s_cursor.length = 0; // 重置长度 + module.s_cursor.current = module.s_cursor.guard; // 重置游标位置 + if (is_alpha(module.s_cursor.source[module.s_cursor.guard])) { + std::string word = ident_advance(); + return token_t(scanner_ident(word, word.size()),word, module.s_cursor.line, module.s_cursor.column); + } + + if (is_number(module.s_cursor.source[module.s_cursor.guard])) { + std::string word; + long decimal = 0; + + if (module.s_cursor.source[module.s_cursor.guard] == '0') { + // 可能是十六进制、八进制或二进制 + if (module.s_cursor.source[module.s_cursor.guard + 1] == 'x' || + module.s_cursor.source[module.s_cursor.guard + 1] == 'X') { + word = hex_number_advance(); + decimal = number_convert(word, 16); + } else if (module.s_cursor.source[module.s_cursor.guard + 1] == 'o' || + module.s_cursor.source[module.s_cursor.guard + 1] == 'O') { + word = oct_number_advance(); + decimal = number_convert(word, 8); + } else if (module.s_cursor.source[module.s_cursor.guard + 1] == 'b' || + module.s_cursor.source[module.s_cursor.guard + 1] == 'B') { + word = bin_number_advance(); + decimal = number_convert(word, 2); + } + else { + word = number_advance(); + if (word.size() > 1) word = word.substr(1, word.size() - 1); + decimal = number_convert(word,10); + } + } else { + word = number_advance(); + decimal = number_convert(word, 10); + } + + token_type_t type; + if (is_float(word)) { + type = TOKEN_LITERAL_FLOAT; + } else { + type = TOKEN_LITERAL_INT; + } + return token_t(type, word, module.s_cursor.line, module.s_cursor.column); + } + if (is_string(module.s_cursor.source[module.s_cursor.guard])) { + std::string word = string_advance(); + return token_t(TOKEN_LITERAL_STRING, word, module.s_cursor.line, module.s_cursor.column); + } + + token_type_t type = special_char(); + + return token_t(type, gen_word(), module.s_cursor.line, module.s_cursor.column); + } + + +}; + diff --git a/include/syntax/token.h b/include/syntax/token.h new file mode 100644 index 0000000..c0b1561 --- /dev/null +++ b/include/syntax/token.h @@ -0,0 +1,201 @@ +#pragma once +#include +#include +#include + +#define DEBUG_SCANNER + +enum token_type_t { + TOKEN_NOT_IN_THIS_TYPE = 0, + TOKEN_LEFT_PAREN, + TOKEN_RIGHT_PAREN,// () + TOKEN_LEFT_SQUARE, + TOKEN_RIGHT_SQUARE,// [] + TOKEN_LEFT_CURLY, + TOKEN_RIGHT_CURLY,// {} + TOKEN_LEFT_ANGLE, // < + TOKEN_LESS_THAN, // < + TOKEN_RIGHT_ANGLE,// > + + TOKEN_COMMA, // , + TOKEN_DOT, // . + TOKEN_MINUS, // - + TOKEN_PLUS, // + + TOKEN_COLON, // : + TOKEN_SEMICOLON, // ; + TOKEN_SLASH, // / + TOKEN_STAR, // a * b, *a + TOKEN_PERSON, // % + TOKEN_QUESTION, // ? + TOKEN_RIGHT_ARROW,// -> + + TOKEN_NOT,// ! + TOKEN_NOT_EQUAL, + TOKEN_EQUAL, + TOKEN_EQUAL_EQUAL, + TOKEN_GREATER_EQUAL,// >= + TOKEN_LESS_EQUAL, // <= + TOKEN_AND_AND, // && + TOKEN_OR_OR, // || + + TOKEN_PLUS_EQUAL, // += + TOKEN_MINUS_EQUAL, // -= + TOKEN_STAR_EQUAL, // *= + TOKEN_SLASH_EQUAL, // /= + TOKEN_PERSON_EQUAL, // %= + TOKEN_AND_EQUAL, // &= + TOKEN_OR_EQUAL, // |= + TOKEN_XOR_EQUAL, // ^= + TOKEN_LEFT_SHIFT_EQUAL, // <<= + TOKEN_RIGHT_SHIFT_EQUAL,// >>= + + // 位运算 + TOKEN_TILDE, // ~ + TOKEN_AND, // & + TOKEN_OR, // | + TOKEN_XOR, // ^ + TOKEN_LEFT_SHIFT, // << + TOKEN_RIGHT_SHIFT,// >> + + // 字面量 + TOKEN_IDENT, // 标识符 + TOKEN_LITERAL_STRING, + TOKEN_LITERAL_FLOAT, + TOKEN_LITERAL_INT, + + // 类型 + TOKEN_STRING, + TOKEN_BOOL, + TOKEN_U8, + TOKEN_U16, + TOKEN_U32, + TOKEN_U64, + TOKEN_I8, + TOKEN_I16, + TOKEN_I32, + TOKEN_I64, + TOKEN_F32, + TOKEN_F64, + + // 内置复合类型 + TOKEN_ARR, + TOKEN_VEC, + TOKEN_MAP, + TOKEN_TUP, + + // 关键字 + TOKEN_VAR, + TOKEN_TRUE, + TOKEN_FALSE, + TOKEN_TYPE, + TOKEN_STRUCT, + TOKEN_CONTINUE, + TOKEN_BREAK, + TOKEN_FOR, + TOKEN_IN, + TOKEN_IF, + TOKEN_ELSE, + TOKEN_ELSE_IF, + TOKEN_FN, + TOKEN_RETURN, + TOKEN_STMT_EOF, // ; + TOKEN_EOF,// TOKEN_EOF 一定要在最后一个,否则会索引溢出 +}; + +inline static std::unordered_map token_str = { + {TOKEN_LEFT_PAREN, "("}, + {TOKEN_RIGHT_PAREN, ")"}, + {TOKEN_LEFT_SQUARE, "["}, + {TOKEN_RIGHT_SQUARE, "]"}, + {TOKEN_LEFT_CURLY, "{"}, + {TOKEN_RIGHT_CURLY, "}"}, + {TOKEN_LEFT_ANGLE, "<"}, + {TOKEN_LESS_THAN, "<"}, + {TOKEN_RIGHT_ANGLE, ">"}, + {TOKEN_COMMA, ","}, + {TOKEN_DOT, "."}, + {TOKEN_MINUS, "-"}, + {TOKEN_PLUS, "+"}, + {TOKEN_COLON, ":"}, + {TOKEN_SEMICOLON, ";"}, + {TOKEN_SLASH, "/"}, + {TOKEN_STAR, "*"}, + {TOKEN_PERSON, "%"}, + {TOKEN_QUESTION, "?"}, + {TOKEN_RIGHT_ARROW, "->"}, + {TOKEN_NOT, "!"}, + {TOKEN_NOT_EQUAL, "!="}, + {TOKEN_EQUAL, "="}, + {TOKEN_EQUAL_EQUAL, "=="}, + {TOKEN_GREATER_EQUAL, ">="}, + {TOKEN_LESS_EQUAL, "<="}, + {TOKEN_AND_AND, "&&"}, + {TOKEN_OR_OR, "||"}, + {TOKEN_PLUS_EQUAL, "+="}, + {TOKEN_MINUS_EQUAL, "-="}, + {TOKEN_STAR_EQUAL, "*="}, + {TOKEN_SLASH_EQUAL, "/="}, + {TOKEN_PERSON_EQUAL, "%="}, + {TOKEN_AND_EQUAL, "&="}, + {TOKEN_OR_EQUAL, "|="}, + {TOKEN_XOR_EQUAL, "^="}, + {TOKEN_LEFT_SHIFT_EQUAL, "<<="}, + {TOKEN_RIGHT_SHIFT_EQUAL, ">>="}, + {TOKEN_TILDE, "~"}, + {TOKEN_AND, "&"}, + {TOKEN_OR, "|"}, + {TOKEN_XOR, "^"}, + {TOKEN_LEFT_SHIFT, "<<"}, + {TOKEN_RIGHT_SHIFT, ">>"}, + {TOKEN_IDENT, "ident_literal"}, + {TOKEN_LITERAL_STRING, "string_literal"}, + {TOKEN_LITERAL_FLOAT, "float_literal"}, + {TOKEN_LITERAL_INT, "int_literal"}, + {TOKEN_STRING, "string"}, + {TOKEN_BOOL, "bool"}, + {TOKEN_U8, "u8"}, + {TOKEN_U16, "u16"}, + {TOKEN_U32, "u32"}, + {TOKEN_U64, "u64"}, + {TOKEN_I8, "i8"}, + {TOKEN_I16, "i16"}, + {TOKEN_I32, "i32"}, + {TOKEN_I64, "i64"}, + {TOKEN_F32, "f32"}, + {TOKEN_F64, "f64"}, + {TOKEN_ARR, "arr"}, + {TOKEN_VEC, "vec"}, + {TOKEN_MAP, "map"}, + {TOKEN_TUP, "tup"}, + {TOKEN_VAR, "var"}, + {TOKEN_TRUE, "true"}, + {TOKEN_FALSE, "false"}, + {TOKEN_TYPE, "type"}, + {TOKEN_STRUCT, "struct"}, + {TOKEN_CONTINUE, "continue"}, + {TOKEN_BREAK, "break"}, + {TOKEN_FOR, "for"}, + {TOKEN_IN, "in"}, + {TOKEN_IF, "if"}, + {TOKEN_ELSE, "else"}, + {TOKEN_ELSE_IF, "else if"}, + {TOKEN_FN, "fn"}, + {TOKEN_RETURN, "return"}, + {TOKEN_STMT_EOF, ";"}, + {TOKEN_EOF, "\0"} +}; + +struct token_t { + token_type_t type; + std::string literal; + int line; + int column; + int length; + + token_t(token_type_t token_type, std::string literal, int line, int column) + : type(token_type), literal(literal), line(line), column(column), length(literal.size()) { +#ifdef DEBUG_SCANNER + std::cout << "[DEBUG] SCANNER line: " << line << ", type: " << token_str[token_type] << ", literal: " << literal << std::endl; +#endif + } +}; \ No newline at end of file diff --git a/include/types.hpp b/include/types.hpp new file mode 100644 index 0000000..e85d6fa --- /dev/null +++ b/include/types.hpp @@ -0,0 +1,37 @@ +#pragma once +#include +#include +#include "syntax/token.h" + +struct scanner_cursor_t { + std::string source; + std::string::size_type current; + std::string::size_type guard; + int length; + + int line; // 扫描器当前所在的行 + int column; // 扫描器当前所在的列 + + char space_prev; // 记录空行,注释前的上一个字符 + char space_next; +}; + +struct module_t { + std::string source; + + scanner_cursor_t s_cursor; + std::vector token_list; + + module_t(std::string source) + : source(source) { + s_cursor.source = source; + s_cursor.line = 1; + s_cursor.column = 1; + s_cursor.length = 0; + s_cursor.current = 0; + s_cursor.guard = 0; + + s_cursor.space_prev = '\0'; + s_cursor.space_next = '\0'; + } +}; \ No newline at end of file diff --git a/input.txt b/input.txt new file mode 100644 index 0000000..eb79b32 --- /dev/null +++ b/input.txt @@ -0,0 +1,52 @@ +//Merge Sort + +struct Point { + x:i8; + y:i8; +} +// Struct + +/* +ababa +*/ +[Point:105] tmp; + +fn MergeSort([Point:20] v,i8 l,i8 r) -> { + if l>r { + return ; + } + var mid = l + r >>1; + MergeSort(v,l,mid); + MergeSort(v,mid+1,r); + i8 i=l,j=mid+1,k=l; + for ;i <= mid && j <= r;k+=1 { + if v[l] < v[r] + { + tmp[k] = v[l]; + l +=1; + } + else{ + tmp[k] = v[r]; + r += 1; + } + } + for ; i<=mid; { + tmp[k] = tmp[i]; + k += 1,i+=1; + } + for ; j <=r ; { + tmp[k] = tmp[j]; + k +=1 , j += 1; + } + +} +fn main()->i8{ + [Point:20] d; + [Point] d; + + for i8 i = 0;i< 20; i++ { + d[i] = {x:i * i,y:i}; + } + MergeSort(d,d+20); + 0 +} \ No newline at end of file diff --git a/src/Scanner.cpp b/src/Scanner.cpp index fd290c6..7387059 100644 --- a/src/Scanner.cpp +++ b/src/Scanner.cpp @@ -1,6 +1,5 @@ #include "Scanner.h" #include -#include int Scanner::process_const_table(int index) { return 0; @@ -35,45 +34,7 @@ int Scanner::process_identifier_table(int index) { } int Scanner::process_key_table(int index) { - int max_len = 0; - int found_key = -1; // 存储找到的关键字编号 - - // 遍历关键字表 - for (const auto& pair : m_tables.KeyTable) { - const std::string& keyword = pair.second; - int len = keyword.length(); - - // 检查剩余长度是否足够 - if (index + len > m_source_code.length()) { - continue; - } - - // 比较子串是否匹配关键字 - if (m_source_code.substr(index, len) == keyword) { - // 检查关键字后是否紧跟字母/数字/下划线 - if (index + len < m_source_code.length()) { - char next_char = m_source_code[index + len]; - if (isalnum(next_char) || next_char == '_') { - continue; // 是标识符的一部分,跳过 - } - } - - // 更新最长匹配(解决"float32"和"float64"的冲突) - if (len > max_len) { - max_len = len; - found_key = pair.first; - } - } - } - - // 找到有效关键字 - if (max_len > 0) { - Token token({ found_key, KEY_TABLE }); - m_token_list.push_back(token); - return max_len; - } - - return 0; // 未识别到关键字 + return 0; } int Scanner::process_punct_table(int index) { diff --git a/src/Tbs.cpp b/src/Tbs.cpp deleted file mode 100644 index 728e99a..0000000 --- a/src/Tbs.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "Tbs.h" diff --git a/src/input.txt b/src/input.txt new file mode 100644 index 0000000..eb79b32 --- /dev/null +++ b/src/input.txt @@ -0,0 +1,52 @@ +//Merge Sort + +struct Point { + x:i8; + y:i8; +} +// Struct + +/* +ababa +*/ +[Point:105] tmp; + +fn MergeSort([Point:20] v,i8 l,i8 r) -> { + if l>r { + return ; + } + var mid = l + r >>1; + MergeSort(v,l,mid); + MergeSort(v,mid+1,r); + i8 i=l,j=mid+1,k=l; + for ;i <= mid && j <= r;k+=1 { + if v[l] < v[r] + { + tmp[k] = v[l]; + l +=1; + } + else{ + tmp[k] = v[r]; + r += 1; + } + } + for ; i<=mid; { + tmp[k] = tmp[i]; + k += 1,i+=1; + } + for ; j <=r ; { + tmp[k] = tmp[j]; + k +=1 , j += 1; + } + +} +fn main()->i8{ + [Point:20] d; + [Point] d; + + for i8 i = 0;i< 20; i++ { + d[i] = {x:i * i,y:i}; + } + MergeSort(d,d+20); + 0 +} \ No newline at end of file diff --git a/unit/input.txt b/unit/input.txt new file mode 100644 index 0000000..eb79b32 --- /dev/null +++ b/unit/input.txt @@ -0,0 +1,52 @@ +//Merge Sort + +struct Point { + x:i8; + y:i8; +} +// Struct + +/* +ababa +*/ +[Point:105] tmp; + +fn MergeSort([Point:20] v,i8 l,i8 r) -> { + if l>r { + return ; + } + var mid = l + r >>1; + MergeSort(v,l,mid); + MergeSort(v,mid+1,r); + i8 i=l,j=mid+1,k=l; + for ;i <= mid && j <= r;k+=1 { + if v[l] < v[r] + { + tmp[k] = v[l]; + l +=1; + } + else{ + tmp[k] = v[r]; + r += 1; + } + } + for ; i<=mid; { + tmp[k] = tmp[i]; + k += 1,i+=1; + } + for ; j <=r ; { + tmp[k] = tmp[j]; + k +=1 , j += 1; + } + +} +fn main()->i8{ + [Point:20] d; + [Point] d; + + for i8 i = 0;i< 20; i++ { + d[i] = {x:i * i,y:i}; + } + MergeSort(d,d+20); + 0 +} \ No newline at end of file diff --git a/unit/scanner_test.cpp b/unit/scanner_test.cpp index 1246e77..19055ee 100644 --- a/unit/scanner_test.cpp +++ b/unit/scanner_test.cpp @@ -1,30 +1,21 @@ -#include "Token.h" + #include "doctest.h" #include "stdc++.h" -#include "Scanner.h" -#include "Tbs.h" +#include "types.hpp" +#include "syntax/Scanner.h" +#include "syntax/token.h" #include using std::string,std::vector; -TEST_CASE("Scanner test identifier table") { - Tbs tables; - std::string src = "abcvljl laadfs fafarwrw"; - Scanner scan(src, tables); - scan.scan(); +TEST_CASE("Scanner test") { + std::ifstream t("input.txt"); + std::stringstream buffer; + buffer<