删除Scanner.cpp

Merge branch 'gary'
# Conflicts: # src/Scanner.cpp # unit/scanner_test.cpp
2025-06-04 21:53:51 +08:00 · 2025-06-04 21:52:50 +08:00 · 2025-06-04 21:46:05 +08:00 · 2025-06-04 20:48:10 +08:00 · 2025-06-04 18:14:24 +08:00 · 2025-06-03 21:58:32 +08:00
14 changed files with 1155 additions and 276 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -11,9 +11,7 @@ include_directories(${CMAKE_SOURCE_DIR}/include)

 file(GLOB SOURCES ${CMAKE_SOURCE_DIR}/src/*.cpp)

-add_executable(Hydrogen ${SOURCES}
-        src/scanner.cpp
-        src/Tbs.cpp)
+add_executable(Hydrogen ${SOURCES})

 file(GLOB TEST_SOURCES ${CMAKE_SOURCE_DIR}/unit/*.cpp)

--- a/include/Scanner.h
+++ b/include/Scanner.h
@ -1,58 +0,0 @@
-# pragma once
-#include "stdc++.h"
-#include "Token.h"
-#include "Tbs.h"
-#include <cstdlib>
-#include <vector>
-#include <iostream>
-
-class Scanner {
-public:
-    Scanner(std::string source_code, Tbs tables) 
-        : m_source_code(source_code), m_tables(tables) {}
-
-    
-    void scan() {
-        int len = 0;
-        for (int i = 0; i < m_source_code.size(); i++) {
-            if (len = process_const_table(i)) {
-                i += len - 1;
-                len = 0;
-            } else if (len = process_identifier_table(i)) {
-                i += len - 1;
-                len = 0;
-            } else if (len = process_key_table(i)) {
-                i += len - 1;
-                len = 0;
-            } else if (len = process_punct_table(i)) {
-                i += len - 1; 
-                len = 0;
-            } else if (m_source_code[i] == ' ' || m_source_code[i] == '\t' || m_source_code[i] == '\n') {
-                continue;
-            } else {
-                std::cerr << "Error: Tokenize" << std::endl;
-                exit(0);
-            } 
-                 
-        }
-    }
-
-    inline std::vector<Token> get_token_list() {
-        return m_token_list;
-    }
-    
-    int process_const_table(int i);
-    int process_identifier_table(int i);
-    int process_key_table(int i);
-    int process_punct_table(int i);
-
-
-private:
-    std::string m_source_code;
-    std::vector<Token> m_token_list;
-    Tbs m_tables;
-    int index;
-
-    // 记录标识符表的索引
-    int identifier_index = 0;
-};
--- a/include/Tbs.h
+++ b/include/Tbs.h
@ -1,91 +0,0 @@
-# pragma once
-#include "stdc++.h"
-#include <unordered_map>
-
-using std::unordered_map,std::string;
-class Tbs {
-public:
-    unordered_map<int,string> ConstTable;
-    unordered_map<int,string> IdTable;
-    std::unordered_map<int, std::string> KeyTable = {
-        {1, "var"},
-        {2, "i8"},
-        {3, "i16"},
-        {4, "i32"},
-        {5, "i64"},
-        {6, "u8"},
-        {7, "u16"},
-        {8, "u32"},
-        {9, "u64"},
-        {10, "float32"},
-        {11, "float64"},
-        {12, "char"},
-        {13, "for"},
-        {14, "if"},
-        {15, "else"},
-        {16, "bool"},
-        {17, "string"},
-        {18, "vector"},
-        {19, "array"},
-        {20, "struct"},
-        {21, "tuple"},
-        {22, "print"},
-        {23, "println"}
-    };
-
-    std::unordered_map<int, std::string> PunctTable = {
-        {1, "-"},
-        {2, "!"},
-        {3, "~"},
-        {4, "/"},
-        {5, "*"},
-        {6, "%"},
-        {7, "+"},
-        {8, "-"},
-        {9, "<<"},
-        {10, ">>"},
-        {11, ">"},
-        {12, ">="},
-        {13, "<"},
-        {14, "<="},
-        {15, "=="},
-        {16, "!="},
-        {17, "&"},
-        {18, "^"},
-        {19, "|"},
-        {20, "&&"},
-        {21, "||"},
-        {22, "="},
-        {23, "%="},
-        {24, "*="},
-        {25, "/="},
-        {26, "+="},
-        {27, "-="},
-        {28, "|="},
-        {29, "&="},
-        {30, "^="},
-        {31, "<<="},
-        {32, ">>="},
-        {33, "("},
-        {34, ")"},
-        {35, "<"},
-        {36, ">"},
-        {37, ","},
-        {38, "."},
-        {39, "["},
-        {40, "]"},
-        {41, "?"},
-        {42, ":"},
-        {43, "->"},
-        {44,";"}
-    };
-
-};
-
-enum Table_Type {
-    CONST_TABLE,
-    ID_TABLE,
-    KEY_TABLE,
-    PUNCT_TABLE
-};
-
--- a/include/Token.h
+++ b/include/Token.h
@ -1,8 +0,0 @@
-#pragma once
-#include "stdc++.h"
-#include "Tbs.h"
-
-struct Token{
-    int id;
-    Table_Type type;
-};
--- a/include/input.txt
+++ b/include/input.txt
@ -0,0 +1,52 @@
+//Merge Sort
+
+struct Point {
+  x:i8;
+  y:i8;
+}
+//  Struct
+
+/*
+ababa
+*/
+[Point:105] tmp;
+
+fn MergeSort([Point:20] v,i8 l,i8 r) -> {
+   if l>r {
+      return ;   
+   }
+   var mid = l + r >>1;
+   MergeSort(v,l,mid);
+   MergeSort(v,mid+1,r);
+   i8 i=l,j=mid+1,k=l;
+   for ;i <= mid && j <= r;k+=1 {
+      if v[l] < v[r]
+      {
+        tmp[k] = v[l];
+        l +=1;
+      }
+      else{
+        tmp[k] = v[r];
+        r += 1;
+      }
+   }
+   for ; i<=mid; {
+    tmp[k] = tmp[i];
+    k += 1,i+=1;
+   }
+   for ; j <=r ; {
+    tmp[k] = tmp[j];
+    k +=1 , j += 1;
+   }
+   
+}
+fn main()->i8{
+   [Point:20] d;
+   [Point] d;
+  
+   for i8 i = 0;i< 20; i++ {
+      d[i] = {x:i * i,y:i};
+   }
+   MergeSort(d,d+20);
+  0 
+}
--- a/include/syntax/Scanner.h
+++ b/include/syntax/Scanner.h
@ -0,0 +1,696 @@
+#pragma once 
+#include "doctest.h"
+#include "token.h"
+#include "../types.hpp"
+#include <cctype>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+class Scanner {
+public:
+    Scanner(module_t& module) 
+        : module(module) {}
+
+    /**
+     * Tokenizes the module source from the current cursor position to the end.
+     *
+     * Blanks and comments are skipped between tokens. The returned list always
+     * ends with a TOKEN_EOF token carrying the final line/column of the cursor.
+     *
+     * @return every token produced by item(), followed by TOKEN_EOF.
+     */
+    inline std::vector<token_t> scan() {
+        std::vector<token_t> tokens;
+
+        while (!at_eof()) {
+            // skip_space() reports whether a newline was consumed; in that case
+            // simply re-check for end of input before scanning a token.
+            if (skip_space()) {
+                continue;
+            }
+
+            // skip_space() also returns false after consuming blanks or comments
+            // that contain no newline, so the cursor may now sit at end of input.
+            // Calling item() there would advance the guard past the buffer.
+            if (at_eof()) {
+                break;
+            }
+
+            token_t token = item();
+            tokens.push_back(token);
+        }
+
+        tokens.push_back(token_t(TOKEN_EOF, "EOF", module.s_cursor.line, module.s_cursor.column));
+        return tokens;
+    }
+
+    
+private:
+    module_t& module;
+
+    inline std::string gen_word() {
+        return module.s_cursor.source.substr(module.s_cursor.current, module.s_cursor.length);
+    }
+
+    inline bool is_space(char c) {
+        if (c == '\n' || c == '\t' || c == '\r' || c == ' ') {
+            return true;
+        }
+        return false;
+    }
+
+
+    inline bool is_string(char s) {
+        return s == '"';
+    }
+
+    /**
+     * Decides whether a numeric lexeme denotes a floating-point literal.
+     *
+     * @param word the digits of the literal as collected by the scanner.
+     * @return true when the word contains an exponent marker ('e'/'E') or exactly
+     *         one '.'; false otherwise. A trailing '.' or more than one '.' is
+     *         reported on std::cerr and yields false.
+     */
+    inline bool is_float(std::string word) {
+        int dot_count = 0;
+        bool has_e = false;
+
+        for (std::string::size_type i = 0; i < word.size(); i++) {
+            if (word[i] == '.')
+                dot_count++;
+            else if (word[i] == 'e' || word[i] == 'E')
+                has_e = true;
+        }
+
+        // A literal must not end with '.'. Inspect the real last character:
+        // indexing with -1 wraps to a huge unsigned index and is undefined behaviour.
+        if (!word.empty() && word.back() == '.') {
+            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                << module.s_cursor.column <<". floating-point numbers cannot end with '.'";
+            return false;
+        }
+
+        // Scientific notation always means floating point.
+        if (has_e) {
+            return true;
+        }
+
+        if (dot_count == 0) {
+            return false;
+        }
+
+        if (dot_count > 1) {
+            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                << module.s_cursor.column <<". floating-point numbers have multiple '.'";
+            return false;
+        }
+
+        return true;
+    }
+
+    // Returns true when c is alphabetic according to std::isalpha.
+    // std::isalpha is undefined for negative arguments other than EOF, so the
+    // (possibly signed) char is converted to unsigned char before the call.
+    inline bool is_alpha(char c) {
+        return std::isalpha(static_cast<unsigned char>(c)) != 0;
+    }
+
+    // Returns true when c is a decimal digit according to std::isdigit.
+    // The argument is converted to unsigned char first because std::isdigit is
+    // undefined for negative arguments other than EOF.
+    inline bool is_number(char c) {
+        return std::isdigit(static_cast<unsigned char>(c)) != 0;
+    }
+
+    inline bool is_hex_number(char c) {
+        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    }
+
+    inline bool is_oct_number(char c) {
+        return c >= '0' && c <= '7';
+    }
+
+    inline bool is_bin_number(char c) {
+        return c == '0' || c == '1';
+    }
+
+    // Returns true when the guard has reached the end of the source text, i.e. it
+    // is at or beyond the last character or it sits on a NUL character.
+    // The bound is checked first: std::string::operator[] is only defined for
+    // indices up to size(), so a guard that overshot must not be dereferenced.
+    inline bool at_eof() {
+        const std::string& src = module.s_cursor.source;
+        return module.s_cursor.guard >= src.size() || src[module.s_cursor.guard] == '\0';
+    }
+
+    inline char guard_advance() {
+        module.s_cursor.guard++;
+        module.s_cursor.length++;
+        module.s_cursor.column++;
+
+        if (module.s_cursor.source[module.s_cursor.guard] == '\n') {
+            module.s_cursor.line++;
+            module.s_cursor.column = 0;
+        }
+
+        return module.s_cursor.source[module.s_cursor.guard];
+    }
+
+    /**
+     * Consumes the character under the guard if it equals `expected`.
+     *
+     * @param expected the character to look for.
+     * @return true when the character matched and the guard was advanced;
+     *         false at end of input or on a mismatch (the guard is not moved).
+     */
+    inline bool match(char expected) {
+        if (at_eof()) 
+            return false;
+
+        // Read from the cursor's own buffer, like every other helper of the
+        // scanner, instead of the separate module-level copy of the source.
+        if (module.s_cursor.source[module.s_cursor.guard] != expected) 
+            return false;
+
+        guard_advance();
+        return true;
+    }
+
+    inline std::string ident_advance() {
+        while((is_alpha(module.s_cursor.source[module.s_cursor.guard]) ||
+               is_number(module.s_cursor.source[module.s_cursor.guard])) && 
+               !at_eof()) {
+                guard_advance();
+
+        }
+
+        return gen_word();
+    }
+
+    /**
+     * Scans one punctuation or operator token starting at the guard.
+     *
+     * The first character is always consumed; following characters are consumed
+     * only when they extend it to a longer operator (for example "<<=").
+     *
+     * @return the token type of the operator, or TOKEN_NOT_IN_THIS_TYPE when the
+     *         character does not start any known operator.
+     */
+    inline token_type_t special_char() {
+        char c = module.s_cursor.source[module.s_cursor.guard];
+        guard_advance();
+        switch (c) {
+            case '(':
+                return TOKEN_LEFT_PAREN;
+            case ')':
+                return TOKEN_RIGHT_PAREN;
+            case '[':
+                return TOKEN_LEFT_SQUARE;
+            case ']':
+                return TOKEN_RIGHT_SQUARE;
+            case '{':
+                return TOKEN_LEFT_CURLY;
+            case '}':
+                return TOKEN_RIGHT_CURLY;
+            case ':':
+                return TOKEN_COLON;
+            case ';':
+                return TOKEN_STMT_EOF;
+            case ',':
+                return TOKEN_COMMA;
+            case '?':
+                return TOKEN_QUESTION;
+            case '.':
+                return TOKEN_DOT;
+            case '~':
+                return TOKEN_TILDE;
+            case '%':
+                return match('=') ? TOKEN_PERSON_EQUAL : TOKEN_PERSON;
+            case '+':
+                return match('=') ? TOKEN_PLUS_EQUAL : TOKEN_PLUS;
+            case '/':
+                return match('=') ? TOKEN_SLASH_EQUAL : TOKEN_SLASH;
+            case '*':
+                return match('=') ? TOKEN_STAR_EQUAL : TOKEN_STAR;
+            case '!':
+                return match('=') ? TOKEN_NOT_EQUAL : TOKEN_NOT;
+            case '=':
+                return match('=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL;
+            case '^':
+                return match('=') ? TOKEN_XOR_EQUAL : TOKEN_XOR;
+            case '-':
+                if (match('=')) {
+                    return TOKEN_MINUS_EQUAL; // -=
+                }
+                if (match('>')) {
+                    return TOKEN_RIGHT_ARROW; // ->
+                }
+                return TOKEN_MINUS;
+            case '<':
+                if (match('<')) {
+                    // "<<=" or "<<"
+                    return match('=') ? TOKEN_LEFT_SHIFT_EQUAL : TOKEN_LEFT_SHIFT;
+                }
+                if (match('=')) {
+                    return TOKEN_LESS_EQUAL; // <=
+                }
+                return TOKEN_LEFT_ANGLE;
+            case '>':
+                if (match('=')) {
+                    return TOKEN_GREATER_EQUAL; // >=
+                }
+                if (match('>')) {
+                    // The second '>' is already consumed here, so the result must
+                    // be a shift token and never a plain '>'.
+                    return match('=') ? TOKEN_RIGHT_SHIFT_EQUAL : TOKEN_RIGHT_SHIFT;
+                }
+                return TOKEN_RIGHT_ANGLE; // >
+            case '&':
+                if (match('&')) {
+                    return TOKEN_AND_AND; // &&
+                }
+                return match('=') ? TOKEN_AND_EQUAL : TOKEN_AND; // &= or &
+            case '|':
+                if (match('|')) {
+                    return TOKEN_OR_OR; // ||
+                }
+                return match('=') ? TOKEN_OR_EQUAL : TOKEN_OR; // |= or |
+            default:
+                return token_type_t::TOKEN_NOT_IN_THIS_TYPE;
+        }
+    }
+
+    /**
+     * Scans a double-quoted string literal whose opening quote is under the guard.
+     *
+     * Escape sequences (\n, \t, \r, \b, \f, \a, \v, \0, \\, \', \") are decoded.
+     * A raw newline inside the literal, an unknown escape and a missing closing
+     * quote are reported on std::cerr; scanning continues where possible.
+     *
+     * @return the decoded contents of the literal, without the quotes.
+     */
+    inline std::string string_advance() {
+        module.s_cursor.guard++; // skip the opening quote
+        const char escape_char = '\\';
+
+        std::stringstream buf;
+
+        while (!at_eof() && module.s_cursor.source[module.s_cursor.guard] != '\"') {
+            char guard = module.s_cursor.source[module.s_cursor.guard];
+
+            if (guard == '\n') {
+                std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                << module.s_cursor.column <<". string cannot newline.";
+            }
+
+            // Decode an escape sequence.
+            if (guard == escape_char) {
+                // Step over the backslash itself.
+                module.s_cursor.guard++;
+
+                // A backslash as the very last character has nothing to escape;
+                // stop here so the guard never moves past the end of the buffer.
+                if (at_eof()) {
+                    break;
+                }
+
+                guard = module.s_cursor.source[module.s_cursor.guard];
+
+                switch (guard) {
+                    case 'n':
+                        guard = '\n';
+                        break;
+                    case 't':
+                        guard = '\t';
+                        break;
+                    case 'r':
+                        guard = '\r';
+                        break;
+                    case 'b':
+                        guard = '\b';
+                        break;
+                    case 'f':
+                        guard = '\f';
+                        break;
+                    case 'a':
+                        guard = '\a';
+                        break;
+                    case 'v':
+                        guard = '\v';
+                        break;
+                    case '0':
+                        guard = '\0';
+                        break;
+                    case '\\':
+                    case '\'':
+                    case '\"':
+                        break;
+                    default:
+                        std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                        << module.s_cursor.column <<". unknown escape char " << guard;
+                }
+            }
+
+            buf << guard;
+            guard_advance();
+        }
+
+        if (at_eof()) {
+            // No closing quote: report it and leave the guard at end of input
+            // instead of stepping beyond the buffer.
+            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                << module.s_cursor.column <<". string not terminated.";
+            return buf.str();
+        }
+
+        // Skip the closing quote.
+        module.s_cursor.guard++;
+
+        return buf.str();
+    }
+
+    inline long number_convert(std::string word, int base) {
+        try {
+            long decimal = std::stol(word, 0, base);
+            return decimal;
+        } catch (const std::invalid_argument& e) {
+            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                << module.s_cursor.column <<". Invalid number: " << word << std::endl;
+            return 0;
+        } catch (const std::out_of_range& e) {
+            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                << module.s_cursor.column <<". Number out of range: " << word << std::endl;
+            return 0;
+        }
+    }
+
+    inline double number_convert_float(std::string word) {
+        try {
+            double decimal = std::stod(word);
+            return decimal;
+        } catch (const std::invalid_argument& e) {
+            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
+                << module.s_cursor.column <<". Invalid number: " << word << std::endl;
+            return 0;
+        } catch (const std::out_of_range& e) {
+            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
+                << module.s_cursor.column <<". Number out of range: " << word << std::endl;
+            return 0;
+        }
+    }
+
+    inline std::string hex_number_advance() {
+        module.s_cursor.guard += 2; // 跳过 0x
+
+        while (is_hex_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
+            guard_advance();
+        }
+
+        return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
+    }
+
+    inline std::string oct_number_advance() {
+        module.s_cursor.guard += 2; // 跳过 0o
+
+        while (is_oct_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
+            guard_advance();
+        }
+
+        return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
+    }
+
+    inline std::string bin_number_advance() {
+        module.s_cursor.guard += 2; // 跳过 0b
+
+        while (is_bin_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
+            guard_advance();
+        }
+
+        return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
+    }
+
+    inline std::string number_advance() {
+        while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
+            guard_advance();
+        }
+
+        // 处理小数部分
+        if (module.s_cursor.source[module.s_cursor.guard] == '.' && is_number(module.s_cursor.source[module.s_cursor.guard + 1])) {
+            guard_advance(); // 跳过小数点
+            while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
+                guard_advance();
+            }
+        }
+
+        // 处理科学计数法
+        if ((module.s_cursor.source[module.s_cursor.guard] == 'e' || module.s_cursor.source[module.s_cursor.guard] == 'E')
+            && (is_number(module.s_cursor.source[module.s_cursor.guard + 1]) || 
+                module.s_cursor.source[module.s_cursor.guard + 1] == '+' || 
+                module.s_cursor.source[module.s_cursor.guard + 1] == '-')) {
+            guard_advance(); // 跳过 e 或 E
+            if (module.s_cursor.source[module.s_cursor.guard] == '+' || module.s_cursor.source[module.s_cursor.guard] == '-') {
+                guard_advance(); // 跳过符号
+            }
+            while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
+                guard_advance();
+            }
+        }
+
+        return gen_word();
+    }
+
+    /**
+     * Classifies an identifier-shaped word as a keyword/type token or TOKEN_IDENT.
+     *
+     * The word must equal a reserved word exactly. The previous nested switch
+     * relied on case fall-through and therefore misclassified ordinary identifiers
+     * (e.g. "en" as fn, "v8" as u8, "if6" as i16, "meturn" as return).
+     *
+     * @param word   the scanned word.
+     * @param length length of the word; kept for interface compatibility, the
+     *               comparison uses the size of `word` itself.
+     * @return the matching keyword token, or TOKEN_IDENT when none matches.
+     */
+    static token_type_t scanner_ident(std::string word, int length) {
+        struct keyword_t {
+            const char* text;
+            token_type_t type;
+        };
+
+        // Exactly the reserved words the scanner recognised before.
+        static const keyword_t keywords[] = {
+            {"array", TOKEN_ARR},
+            {"bool", TOKEN_BOOL},
+            {"break", TOKEN_BREAK},
+            {"continue", TOKEN_CONTINUE},
+            {"else", TOKEN_ELSE},
+            {"fn", TOKEN_FN},
+            {"false", TOKEN_FALSE},
+            {"f32", TOKEN_F32},
+            {"f64", TOKEN_F64},
+            {"for", TOKEN_FOR},
+            {"if", TOKEN_IF},
+            {"i8", TOKEN_I8},
+            {"i16", TOKEN_I16},
+            {"i32", TOKEN_I32},
+            {"i64", TOKEN_I64},
+            {"string", TOKEN_STRING},
+            {"struct", TOKEN_STRUCT},
+            {"type", TOKEN_TYPE},
+            {"tup", TOKEN_TUP},
+            {"true", TOKEN_TRUE},
+            {"var", TOKEN_VAR},
+            {"vec", TOKEN_VEC},
+            {"u8", TOKEN_U8},
+            {"u16", TOKEN_U16},
+            {"u32", TOKEN_U32},
+            {"u64", TOKEN_U64},
+            {"map", TOKEN_MAP},
+            {"return", TOKEN_RETURN},
+        };
+
+        (void) length;
+
+        for (const keyword_t& keyword : keywords) {
+            if (word == keyword.text) {
+                return keyword.type;
+            }
+        }
+
+        return TOKEN_IDENT;
+    }
+
+    inline bool multi_comment_end() {
+        return module.s_cursor.source[module.s_cursor.guard] == '*' && 
+               module.s_cursor.source[module.s_cursor.guard + 1] == '/';
+    }
+
+    /**
+     * Skips blanks, line comments and block comments starting at the guard.
+     *
+     * Before skipping, space_prev records the character preceding the guard (when
+     * the guard has moved since the last token start); on return space_next
+     * records the first character that was not skipped.
+     *
+     * @return true when at least one newline outside of a comment was skipped;
+     *         false otherwise, including when a block comment is not terminated
+     *         (which is reported on std::cerr).
+     */
+    inline bool skip_space() {
+        bool has_new = false;
+
+        if (module.s_cursor.guard != module.s_cursor.current) {
+            module.s_cursor.space_prev = module.s_cursor.source[module.s_cursor.guard - 1];
+        }
+
+        while (true) {
+            char c = module.s_cursor.source[module.s_cursor.guard];
+            switch (c) {
+                case ' ':
+                case '\r':
+                case '\t': {
+                    guard_advance();
+                    break;
+                }
+                case '\n': {
+                    guard_advance();
+                    has_new = true;
+                    break;
+                }
+                case '/': {
+                    const char next = module.s_cursor.source[module.s_cursor.guard + 1];
+                    if (next == '/') {
+                        // Line comment: run up to (not past) the newline.
+                        while (module.s_cursor.source[module.s_cursor.guard] != '\n' && !at_eof()) {
+                            guard_advance();
+                        }
+                        break;
+                    }
+                    if (next == '*') {
+                        // Step over the opening "/*" first; otherwise its '*'
+                        // could pair with a following '/' and "/*/" would be
+                        // taken for a complete comment.
+                        guard_advance(); // skip /
+                        guard_advance(); // skip *
+
+                        while (!multi_comment_end()) {
+                            if (at_eof()) {
+                                std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: " 
+                                    << module.s_cursor.column <<". multi comment not end.";
+                                return false;
+                            }
+                            guard_advance();
+                        }
+
+                        guard_advance(); // skip *
+                        guard_advance(); // skip /
+                        break;
+                    }
+                    // A lone '/' is an operator, not the start of a comment.
+                    module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard];
+                    return has_new;
+                }
+                default: {
+                    module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard];
+                    return has_new;
+                }
+            }
+        }
+
+    }
+
+    /**
+     * Scans exactly one token starting at the guard.
+     *
+     * Resets the lexeme start/length, then dispatches on the first character:
+     * letters start an identifier or keyword, digits a numeric literal, '"' a
+     * string literal, anything else an operator/punctuation token.
+     *
+     * @return the scanned token; its line/column are those of the cursor after
+     *         the token has been consumed.
+     */
+    inline token_t item() {
+        module.s_cursor.length = 0; // reset the lexeme length
+        module.s_cursor.current = module.s_cursor.guard; // lexeme starts at the guard
+        if (is_alpha(module.s_cursor.source[module.s_cursor.guard])) {
+            std::string word = ident_advance();
+            return token_t(scanner_ident(word, word.size()),word, module.s_cursor.line, module.s_cursor.column);
+        }
+
+        if (is_number(module.s_cursor.source[module.s_cursor.guard])) {
+            std::string word;
+            token_type_t type = TOKEN_LITERAL_INT;
+
+            const char first = module.s_cursor.source[module.s_cursor.guard];
+            const char prefix = module.s_cursor.source[module.s_cursor.guard + 1];
+
+            // The conversions below are run for their range/format diagnostics only.
+            // Radix-prefixed literals are always integers: a hex digit 'e'/'E'
+            // must not be mistaken for an exponent marker.
+            if (first == '0' && (prefix == 'x' || prefix == 'X')) {
+                word = hex_number_advance();
+                (void) number_convert(word, 16);
+            } else if (first == '0' && (prefix == 'o' || prefix == 'O')) {
+                word = oct_number_advance();
+                (void) number_convert(word, 8);
+            } else if (first == '0' && (prefix == 'b' || prefix == 'B')) {
+                word = bin_number_advance();
+                (void) number_convert(word, 2);
+            } else {
+                word = number_advance();
+                if (is_float(word)) {
+                    // Keep the literal intact ("0.5" stays "0.5") and validate it
+                    // as a floating-point value rather than as an integer.
+                    type = TOKEN_LITERAL_FLOAT;
+                    (void) number_convert_float(word);
+                } else {
+                    // Integer written with a leading zero: drop that one zero,
+                    // as before, then validate as decimal.
+                    if (first == '0' && word.size() > 1) word = word.substr(1, word.size() - 1);
+                    (void) number_convert(word, 10);
+                }
+            }
+
+            return token_t(type, word, module.s_cursor.line, module.s_cursor.column);
+        }
+        if (is_string(module.s_cursor.source[module.s_cursor.guard])) {
+            std::string word = string_advance();
+            return token_t(TOKEN_LITERAL_STRING, word, module.s_cursor.line, module.s_cursor.column);
+        }
+
+        token_type_t type = special_char();
+
+        return token_t(type, gen_word(), module.s_cursor.line, module.s_cursor.column);
+    }
+
+
+};
+
--- a/include/syntax/token.h
+++ b/include/syntax/token.h
@ -0,0 +1,201 @@
+#pragma once 
+#include <iostream>
+#include <string>
+#include <unordered_map>
+
+#define DEBUG_SCANNER
+
+enum token_type_t {
+    TOKEN_NOT_IN_THIS_TYPE = 0,
+    TOKEN_LEFT_PAREN,
+    TOKEN_RIGHT_PAREN,// ()
+    TOKEN_LEFT_SQUARE,
+    TOKEN_RIGHT_SQUARE,// []
+    TOKEN_LEFT_CURLY,
+    TOKEN_RIGHT_CURLY,// {}
+    TOKEN_LEFT_ANGLE, // <
+    TOKEN_LESS_THAN,  // <
+    TOKEN_RIGHT_ANGLE,// >
+
+    TOKEN_COMMA,      // ,
+    TOKEN_DOT,        // .
+    TOKEN_MINUS,      // -
+    TOKEN_PLUS,       // +
+    TOKEN_COLON,      // :
+    TOKEN_SEMICOLON,  // ;
+    TOKEN_SLASH,      // /
+    TOKEN_STAR,       //  a * b, *a
+    TOKEN_PERSON,     // %
+    TOKEN_QUESTION,   // ?
+    TOKEN_RIGHT_ARROW,// ->
+
+    TOKEN_NOT,// !
+    TOKEN_NOT_EQUAL,
+    TOKEN_EQUAL,
+    TOKEN_EQUAL_EQUAL,
+    TOKEN_GREATER_EQUAL,// >=
+    TOKEN_LESS_EQUAL,   // <=
+    TOKEN_AND_AND,      // &&
+    TOKEN_OR_OR,        // ||
+
+    TOKEN_PLUS_EQUAL,       // +=
+    TOKEN_MINUS_EQUAL,      // -=
+    TOKEN_STAR_EQUAL,       // *=
+    TOKEN_SLASH_EQUAL,      // /=
+    TOKEN_PERSON_EQUAL,     // %=
+    TOKEN_AND_EQUAL,        // &=
+    TOKEN_OR_EQUAL,         // |=
+    TOKEN_XOR_EQUAL,        // ^=
+    TOKEN_LEFT_SHIFT_EQUAL, // <<=
+    TOKEN_RIGHT_SHIFT_EQUAL,// >>=
+
+    // 位运算
+    TOKEN_TILDE,      // ~
+    TOKEN_AND,        // &
+    TOKEN_OR,         // |
+    TOKEN_XOR,        // ^
+    TOKEN_LEFT_SHIFT, // <<
+    TOKEN_RIGHT_SHIFT,// >>
+
+    // 字面量
+    TOKEN_IDENT,      // 标识符
+    TOKEN_LITERAL_STRING,
+    TOKEN_LITERAL_FLOAT,
+    TOKEN_LITERAL_INT,
+
+    // 类型
+    TOKEN_STRING,
+    TOKEN_BOOL,
+    TOKEN_U8,
+    TOKEN_U16,
+    TOKEN_U32,
+    TOKEN_U64,
+    TOKEN_I8,
+    TOKEN_I16,
+    TOKEN_I32,
+    TOKEN_I64,
+    TOKEN_F32,
+    TOKEN_F64,
+
+    // 内置复合类型
+    TOKEN_ARR,
+    TOKEN_VEC,
+    TOKEN_MAP,
+    TOKEN_TUP,
+
+    // 关键字
+    TOKEN_VAR,
+    TOKEN_TRUE,
+    TOKEN_FALSE,
+    TOKEN_TYPE,
+    TOKEN_STRUCT,
+    TOKEN_CONTINUE,
+    TOKEN_BREAK,
+    TOKEN_FOR,
+    TOKEN_IN,
+    TOKEN_IF,
+    TOKEN_ELSE,
+    TOKEN_ELSE_IF,
+    TOKEN_FN,
+    TOKEN_RETURN,
+    TOKEN_STMT_EOF, // ;
+    TOKEN_EOF,// TOKEN_EOF 一定要在最后一个，否则会索引溢出
+};
+
+inline static std::unordered_map<token_type_t, std::string> token_str = {
+    {TOKEN_LEFT_PAREN, "("},
+    {TOKEN_RIGHT_PAREN, ")"},
+    {TOKEN_LEFT_SQUARE, "["},
+    {TOKEN_RIGHT_SQUARE, "]"},
+    {TOKEN_LEFT_CURLY, "{"},
+    {TOKEN_RIGHT_CURLY, "}"},
+    {TOKEN_LEFT_ANGLE, "<"},
+    {TOKEN_LESS_THAN, "<"},
+    {TOKEN_RIGHT_ANGLE, ">"},
+    {TOKEN_COMMA, ","},
+    {TOKEN_DOT, "."},
+    {TOKEN_MINUS, "-"},
+    {TOKEN_PLUS, "+"},
+    {TOKEN_COLON, ":"},
+    {TOKEN_SEMICOLON, ";"},
+    {TOKEN_SLASH, "/"},
+    {TOKEN_STAR, "*"},
+    {TOKEN_PERSON, "%"},
+    {TOKEN_QUESTION, "?"},
+    {TOKEN_RIGHT_ARROW, "->"},
+    {TOKEN_NOT, "!"},
+    {TOKEN_NOT_EQUAL, "!="},
+    {TOKEN_EQUAL, "="},
+    {TOKEN_EQUAL_EQUAL, "=="},
+    {TOKEN_GREATER_EQUAL, ">="},
+    {TOKEN_LESS_EQUAL, "<="},
+    {TOKEN_AND_AND, "&&"},
+    {TOKEN_OR_OR, "||"},
+    {TOKEN_PLUS_EQUAL, "+="},
+    {TOKEN_MINUS_EQUAL, "-="},
+    {TOKEN_STAR_EQUAL, "*="},
+    {TOKEN_SLASH_EQUAL, "/="},
+    {TOKEN_PERSON_EQUAL, "%="},
+    {TOKEN_AND_EQUAL, "&="},
+    {TOKEN_OR_EQUAL, "|="},
+    {TOKEN_XOR_EQUAL, "^="},
+    {TOKEN_LEFT_SHIFT_EQUAL, "<<="},
+    {TOKEN_RIGHT_SHIFT_EQUAL, ">>="},
+    {TOKEN_TILDE, "~"},
+    {TOKEN_AND, "&"},
+    {TOKEN_OR, "|"},
+    {TOKEN_XOR, "^"},
+    {TOKEN_LEFT_SHIFT, "<<"},
+    {TOKEN_RIGHT_SHIFT, ">>"},
+    {TOKEN_IDENT, "ident_literal"},
+    {TOKEN_LITERAL_STRING, "string_literal"},
+    {TOKEN_LITERAL_FLOAT, "float_literal"},
+    {TOKEN_LITERAL_INT, "int_literal"},
+    {TOKEN_STRING, "string"},
+    {TOKEN_BOOL, "bool"},
+    {TOKEN_U8, "u8"},
+    {TOKEN_U16, "u16"},
+    {TOKEN_U32, "u32"},
+    {TOKEN_U64, "u64"},
+    {TOKEN_I8, "i8"},
+    {TOKEN_I16, "i16"},
+    {TOKEN_I32, "i32"},
+    {TOKEN_I64, "i64"},
+    {TOKEN_F32, "f32"},
+    {TOKEN_F64, "f64"},
+    {TOKEN_ARR, "arr"},
+    {TOKEN_VEC, "vec"},
+    {TOKEN_MAP, "map"},
+    {TOKEN_TUP, "tup"},
+    {TOKEN_VAR, "var"},
+    {TOKEN_TRUE, "true"},
+    {TOKEN_FALSE, "false"},
+    {TOKEN_TYPE, "type"},
+    {TOKEN_STRUCT, "struct"},
+    {TOKEN_CONTINUE, "continue"},
+    {TOKEN_BREAK, "break"},
+    {TOKEN_FOR, "for"},
+    {TOKEN_IN, "in"},
+    {TOKEN_IF, "if"},
+    {TOKEN_ELSE, "else"},
+    {TOKEN_ELSE_IF, "else if"},
+    {TOKEN_FN, "fn"},
+    {TOKEN_RETURN, "return"},
+    {TOKEN_STMT_EOF, ";"},
+    {TOKEN_EOF, "\0"}
+};
+
+// One lexical token: its type, its text, and the cursor position recorded by the
+// scanner when the token was created.
+struct token_t {
+    token_type_t type;
+    std::string literal; // text of the token
+    int line;
+    int column;
+    int length;          // number of characters in `literal`
+
+    // Builds a token; with DEBUG_SCANNER defined, each construction is logged to
+    // std::cout.
+    token_t(token_type_t token_type, std::string literal, int line, int column) 
+        : type(token_type), literal(literal), line(line), column(column), length(static_cast<int>(literal.size())) {
+#ifdef DEBUG_SCANNER
+            // Look the name up with find(): operator[] would insert an empty
+            // entry into the shared table for every unmapped token type.
+            const auto name = token_str.find(token_type);
+            std::cout << "[DEBUG] SCANNER line: " << line << ", type: " << (name != token_str.end() ? name->second : std::string()) << ", literal: " << literal << std::endl;
+#endif
+        }
+};
--- a/include/types.hpp
+++ b/include/types.hpp
@ -0,0 +1,37 @@
+#pragma once
+#include <string>
+#include <vector>
+#include "syntax/token.h"
+
+struct scanner_cursor_t {  // Scanner position state. No member has a default initializer; module_t's constructor assigns every field.
+    std::string source;  // Text being scanned (module_t's constructor copies its source text here).
+    std::string::size_type current;  // Presumably the offset of the next character to read in source -- confirm in Scanner.
+    std::string::size_type guard;  // Presumably the offset where the token being scanned starts -- confirm in Scanner.
+    int length;  // Presumably the length of the token being scanned -- confirm in Scanner.
+
+    int line;  // Line the scanner is currently on
+    int column;   // Column the scanner is currently on
+
+    char space_prev;  // Records the last character before a blank line or comment
+    char space_next;  // Presumably the first character after that blank line or comment -- confirm in Scanner.
+};
+
+struct module_t {  // One source unit: its text, the scanner cursor over it, and a token list.
+    std::string source;  // Source text of the module.
+
+    scanner_cursor_t s_cursor;  // Scanner position state; fully assigned by the constructor below.
+    std::vector<token_t> token_list;  // Starts empty; presumably filled by the scanner -- confirm in Scanner.
+
+    module_t(std::string source)  // Takes the text by value; not explicit, so a std::string converts to module_t implicitly.
+        : source(source) {  // The member is initialized from the same-named parameter.
+            s_cursor.source = source;  // The cursor keeps its own second copy of the text.
+            s_cursor.line = 1;  // Line and column both start at 1.
+            s_cursor.column = 1;
+            s_cursor.length = 0;
+            s_cursor.current = 0;  // current and guard both start at 0.
+            s_cursor.guard = 0;
+
+            s_cursor.space_prev = '\0';  // Both neighbour-character fields start as NUL.
+            s_cursor.space_next = '\0';
+    }
+};
--- a/input.txt
+++ b/input.txt
@ -0,0 +1,52 @@
+//Merge Sort
+
+struct Point {
+  x:i8;
+  y:i8;
+}
+//  Struct
+
+/*
+ababa
+*/
+[Point:105] tmp;
+
+fn MergeSort([Point:20] v,i8 l,i8 r) -> {
+   if l>r {
+      return ;   
+   }
+   var mid = l + r >>1;
+   MergeSort(v,l,mid);
+   MergeSort(v,mid+1,r);
+   i8 i=l,j=mid+1,k=l;
+   for ;i <= mid && j <= r;k+=1 {
+      if v[l] < v[r]
+      {
+        tmp[k] = v[l];
+        l +=1;
+      }
+      else{
+        tmp[k] = v[r];
+        r += 1;
+      }
+   }
+   for ; i<=mid; {
+    tmp[k] = tmp[i];
+    k += 1,i+=1;
+   }
+   for ; j <=r ; {
+    tmp[k] = tmp[j];
+    k +=1 , j += 1;
+   }
+   
+}
+fn main()->i8{
+   [Point:20] d;
+   [Point] d;
+  
+   for i8 i = 0;i< 20; i++ {
+      d[i] = {x:i * i,y:i};
+   }
+   MergeSort(d,d+20);
+  0 
+}
--- a/src/Scanner.cpp
+++ b/src/Scanner.cpp
@ -1,94 +0,0 @@
-#include "Scanner.h"
-#include <sstream>
-
-int Scanner::process_const_table(int index) {
-    return 0;
-}
-
-int Scanner::process_identifier_table(int index) {
-    std::stringstream buffer;
-    int old_index = index;
-    if (std::isalpha(m_source_code[index])) {
-        buffer << m_source_code[index];
-        index += 1;
-        while(std::isalnum(m_source_code[index])) {
-            buffer << m_source_code[index];
-            index += 1;
-        }
-
-        std::string identifier = buffer.str();
-        for (const auto& key : m_tables.KeyTable) {
-            if (identifier == key.second) {
-                return 0;
-            }
-        }
-
-        m_tables.IdTable.insert({identifier_index, identifier});
-        m_token_list.push_back(Token{identifier_index, ID_TABLE});
-        identifier_index++;
-        return index - old_index;
-    } else {
-        return 0;
-    }
-
-}
-
-int Scanner::process_key_table(int index) {
-    return 0;
-}
-
-int Scanner::process_punct_table(int index) {
-    //identify the Punct in map
-    string s;
-    int n = this->m_source_code.size();
-    char c1= this->m_source_code[index];
-    char c2 = '@';
-    char c3 = '@';
-    if (index + 1 < n) c2 = this->m_source_code[index + 1];
-    if (index + 2 < n) c3 = this->m_source_code[index + 2];
-    if (((c1 == c2 && c2 == '<') || (c1 == c2&& c2 == '>'))&& c3 =='=') {
-        this->m_token_list.push_back({c1=='<'?31:32,PUNCT_TABLE});
-        return 3;
-    }
-    if (c1 == c2 ) {
-        if (c1 =='=') {
-            this->m_token_list.push_back({15,PUNCT_TABLE});
-            return 2;
-        }
-        if (c1 =='|') {
-            this->m_token_list.push_back({21,PUNCT_TABLE});
-            return 2;
-        }
-        if (c1=='&') {
-            this->m_token_list.push_back({20,PUNCT_TABLE});
-            return 2;
-        }
-        if (c1=='<') {
-            this->m_token_list.push_back({9,PUNCT_TABLE});
-            return 2;
-        }
-        if (c1=='>') {
-            this->m_token_list.push_back({10,PUNCT_TABLE});
-            return 2;
-        }
-    }
-
-    string t;
-    t.push_back(c1);
-    t.push_back(c2);
-    for (auto e : this->m_tables.PunctTable) {
-        if (e.second == t) {
-            this->m_token_list.push_back({e.first,PUNCT_TABLE});
-            return 2;
-        }
-    }
-
-    t.pop_back();
-    for (auto e : this->m_tables.PunctTable) {
-        if (e.second == t) {
-            this->m_token_list.push_back({e.first,PUNCT_TABLE});
-            return 1;
-        }
-    }
-    return 0;
-}
--- a/src/Tbs.cpp
+++ b/src/Tbs.cpp
@ -1 +0,0 @@
-#include "Tbs.h"
--- a/src/input.txt
+++ b/src/input.txt
@ -0,0 +1,52 @@
+//Merge Sort
+
+struct Point {
+  x:i8;
+  y:i8;
+}
+//  Struct
+
+/*
+ababa
+*/
+[Point:105] tmp;
+
+fn MergeSort([Point:20] v,i8 l,i8 r) -> {
+   if l>r {
+      return ;   
+   }
+   var mid = l + r >>1;
+   MergeSort(v,l,mid);
+   MergeSort(v,mid+1,r);
+   i8 i=l,j=mid+1,k=l;
+   for ;i <= mid && j <= r;k+=1 {
+      if v[l] < v[r]
+      {
+        tmp[k] = v[l];
+        l +=1;
+      }
+      else{
+        tmp[k] = v[r];
+        r += 1;
+      }
+   }
+   for ; i<=mid; {
+    tmp[k] = tmp[i];
+    k += 1,i+=1;
+   }
+   for ; j <=r ; {
+    tmp[k] = tmp[j];
+    k +=1 , j += 1;
+   }
+   
+}
+fn main()->i8{
+   [Point:20] d;
+   [Point] d;
+  
+   for i8 i = 0;i< 20; i++ {
+      d[i] = {x:i * i,y:i};
+   }
+   MergeSort(d,d+20);
+  0 
+}
--- a/unit/input.txt
+++ b/unit/input.txt
@ -0,0 +1,52 @@
+//Merge Sort
+
+struct Point {
+  x:i8;
+  y:i8;
+}
+//  Struct
+
+/*
+ababa
+*/
+[Point:105] tmp;
+
+fn MergeSort([Point:20] v,i8 l,i8 r) -> {
+   if l>r {
+      return ;   
+   }
+   var mid = l + r >>1;
+   MergeSort(v,l,mid);
+   MergeSort(v,mid+1,r);
+   i8 i=l,j=mid+1,k=l;
+   for ;i <= mid && j <= r;k+=1 {
+      if v[l] < v[r]
+      {
+        tmp[k] = v[l];
+        l +=1;
+      }
+      else{
+        tmp[k] = v[r];
+        r += 1;
+      }
+   }
+   for ; i<=mid; {
+    tmp[k] = tmp[i];
+    k += 1,i+=1;
+   }
+   for ; j <=r ; {
+    tmp[k] = tmp[j];
+    k +=1 , j += 1;
+   }
+   
+}
+fn main()->i8{
+   [Point:20] d;
+   [Point] d;
+  
+   for i8 i = 0;i< 20; i++ {
+      d[i] = {x:i * i,y:i};
+   }
+   MergeSort(d,d+20);
+  0 
+}
--- a/unit/scanner_test.cpp
+++ b/unit/scanner_test.cpp
@ -1,30 +1,21 @@
-#include "Token.h"
+
 #include "doctest.h"
 #include "stdc++.h"
-#include "Scanner.h"
-#include "Tbs.h"
+#include "types.hpp"
+#include "syntax/Scanner.h"
+#include "syntax/token.h"
 #include <vector>
 using std::string,std::vector;

-TEST_CASE("Scanner test identifier table") {
-    Tbs tables;
-    std::string src = "a += b b<<=casd;";
-    Scanner scan(src, tables);
-    scan.scan();
+TEST_CASE("Scanner test") {  // Smoke test: runs the scanner over the contents of input.txt; no assertions are made here.
+    std::ifstream t("input.txt");  // Relative path; the stream is not checked for a failed open.
+    std::stringstream buffer;
+    buffer<<t.rdbuf();  // Read the whole file into the buffer.
+    std::cout<<buffer.str();  // Echo the input text to stdout.
+    module_t module(buffer.str());
+    Scanner scanner(module);

-    for (auto value_src: scan.get_token_list()) {
-        std::cout << value_src.id << " " << value_src.type << "\n";
-    }
+    scanner.scan();
 }


-// TEST_CASE("Scanner test Punct table") {
-//     Tbs tables = {};
-//     std::string src = "+=---<<=>>>===--((([]--<<<>.";
-//     Scanner scan(src, tables);
-//     scan.scan();
-//     std::cout<<"test\n";
-//     for (auto e : scan.get_token_list()) {
-//         std::cout<<e.id<<" "<<e.type<<" "<<tables.PunctTable[e.id]<<"\n";
-//     }
-// }
Author	SHA1	Message	Date
Gary Gan	61f5dde8c9	删除Scanner.cpp	2025-06-04 21:53:51 +08:00
Gary Gan	b6d6df0894	Merge branch 'gary' # Conflicts: # src/Scanner.cpp # unit/scanner_test.cpp	2025-06-04 21:52:50 +08:00
Guan Inf	3ea3dfde27	成功测试identifier和Punctutation , 增加界符;	2025-06-04 21:46:05 +08:00
Gary Gan	0a3bfc4a08	大致实现了词法分析器	2025-06-04 20:48:10 +08:00
Gary Gan	25236a4901	尝试重构一下代码	2025-06-04 18:14:24 +08:00
Gary Gan	0bcec890a0	Merge remote-tracking branch 'origin/main' # Conflicts: # unit/scanner_test.cpp	2025-06-03 21:58:32 +08:00
Gary Gan	0583348986	实现int Scanner::process_key_table(int index)	2025-06-03 21:58:08 +08:00