From 0a3bfc4a0810c9255e382d11e9d7840f7853a700 Mon Sep 17 00:00:00 2001
From: Gary Gan <gan.fang.yi@foxmail.com>
Date: Wed, 4 Jun 2025 20:48:10 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A4=A7=E8=87=B4=E5=AE=9E=E7=8E=B0=E4=BA=86?=
 =?UTF-8?q?=E8=AF=8D=E6=B3=95=E5=88=86=E6=9E=90=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/syntax/Scanner.h | 321 ++++++++++++++++++++++++++++-----------
 include/types.hpp        |   6 +
 unit/scanner_test.cpp    |  29 +---
 3 files changed, 243 insertions(+), 113 deletions(-)

diff --git a/include/syntax/Scanner.h b/include/syntax/Scanner.h
index 8b9e9f8..4e80214 100644
--- a/include/syntax/Scanner.h
+++ b/include/syntax/Scanner.h
@@ -12,6 +12,24 @@ public:
     Scanner(module_t& module) 
         : module(module) {}
 
+    // Tokenizes the whole module source; the result always ends with a TOKEN_EOF token.
+    inline std::vector<token_t> scan() {
+        std::vector<token_t> tokens;
+
+        while (!at_eof()) {
+            // skip_space() only reports whether a newline was crossed, so its result
+            // cannot tell us whether any input is left; re-check for EOF instead.
+            skip_space();
+            if (at_eof()) {
+                break; // only trailing whitespace/comments remained
+            }
+            tokens.push_back(item());
+        }
+
+        tokens.push_back(token_t(TOKEN_EOF, "EOF", module.s_cursor.line, module.s_cursor.column));
+        return tokens;
+    }
+
     
 private:
     module_t& module;
@@ -90,7 +108,7 @@ private:
     }
 
     inline bool at_eof() {
-        return module.s_cursor.guard == '\0';
+        return module.s_cursor.source[module.s_cursor.guard] == '\0'; // true when the guard sits on a NUL char; NOTE(review): an embedded NUL in source also counts as EOF
     }
 
     inline char guard_advance() {
@@ -127,7 +145,7 @@ private:
         return gen_word();
     }
 
-    inline token_type_t scanner_special_char(module_t *m) {
+    inline token_type_t special_char() {
         char c = guard_advance();
         switch (c) {
             case '(':
@@ -210,6 +228,7 @@ private:
             default:
                 return token_type_t::TOKEN_NOT_IN_THIS_TYPE;
         }
+
     }
 
     inline std::string string_advance() {
@@ -293,6 +312,21 @@ private:
         }
     }
 
+    // Parses `word` with std::stod; on failure logs the reason to stderr and returns 0.
+    inline double number_convert_float(std::string word) {
+        const char* reason = nullptr;
+        try {
+            return std::stod(word);
+        } catch (const std::invalid_argument&) {
+            reason = ". Invalid number: ";
+        } catch (const std::out_of_range&) {
+            reason = ". Number out of range: ";
+        }
+        std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
+            << module.s_cursor.column << reason << word << std::endl;
+        return 0;
+    }
+
     inline std::string hex_number_advance() {
         module.s_cursor.guard += 2; // 跳过 0x
 
@@ -300,7 +334,7 @@ private:
             guard_advance();
         }
 
-        return gen_word();
+        return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
     }
 
     inline std::string oct_number_advance() {
@@ -310,7 +344,7 @@ private:
             guard_advance();
         }
 
-        return gen_word();
+        return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
     }
 
     inline std::string bin_number_advance() {
@@ -320,7 +354,7 @@ private:
             guard_advance();
         }
 
-        return gen_word();
+        return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
     }
 
     inline std::string number_advance() {
@@ -358,118 +392,103 @@ private:
             case 'a': {
                 switch (word[1]) {
                     case 'r': {
-                        if (word.substr(2, 3) == "ray") {
+                        if (word.substr(2, 3) == "ray" && word.size() == 5) {
                             return TOKEN_ARR;
                         }
                     }
                 }
                 break;
             }
-            case 'b':
+            case 'b': {
                 switch (word[1]) {
                     case 'o':
-                        if (word.substr(2, 2) == "ol") {
+                        if (word.substr(2, 2) == "ol" && word.size() == 4) {
                             return TOKEN_BOOL;
+                        }
                     case 'r':
-                        if (word.substr(2, 3) == "eak") {
+                        if (word.substr(2, 3) == "eak" && word.size() == 5) {
                             return TOKEN_BREAK;
                         }
                 }
                 break;
-            case 'c':
+            }
+            case 'c': {
                 switch (word[1]) {
                     case 'o':
-                        if (word.substr(2, 6) == "ntinue") {
+                        if (word.substr(2, 6) == "ntinue" && word.size() == 8) {
                             return TOKEN_CONTINUE;
                         }
                 }
                 break;
-            case 'e':
-                if (word.substr(1, 3) == "lse") {
-                    if (length == 3 && word[3] == 'i') {
-                        return TOKEN_ELSE_IF;
-                    }
+            }
+            case 'e': {
+                if (word.substr(1, 3) == "lse" && word.size() == 4) {
                     return TOKEN_ELSE;
                 }
-                return scanner_rest(word, length, 1, 3, "lse", TOKEN_ELSE);
+            }
             case 'f': {
                 switch (word[1]) {
                     case 'n':
-                        return scanner_rest(word, length, 2, 0, "", TOKEN_FN);
+                        if (word.size() == 2) {
+                            return TOKEN_FN;
+                        }
                     case 'a':
-                        return scanner_rest(word, length, 2, 3, "lse", TOKEN_FALSE);
-                    case 'l':
-                        return scanner_rest(word, length, 2, 3, "oat", TOKEN_FLOAT);
+                        if (word.substr(2, 3) == "lse" && word.size() == 5) {
+                            return TOKEN_FALSE;
+                        }
                     case '3':
-                        return scanner_rest(word, length, 2, 1, "2", TOKEN_F32);
+                        if (word.substr(2, 1) == "2" && word.size() == 3) {
+                            return TOKEN_F32;
+                        }
                     case '6':
-                        return scanner_rest(word, length, 2, 1, "4", TOKEN_F64);
+                        if (word.substr(2, 1) == "4" && word.size() == 3) {
+                            return TOKEN_F64;
+                        }
                     case 'o':
-                        return scanner_rest(word, length, 2, 1, "r", TOKEN_FOR);
+                        if (word.substr(2, 1) == "r" && word.size() == 3) {
+                            return TOKEN_FOR;
+                        }
                 }
                 break;
             }
-            case 'g':
-                return scanner_rest(word, length, 1, 1, "o", TOKEN_GO);
             case 'i': {
-                if (length == 2 && word[1] == 'n') {
-                    return TOKEN_IN;
-                } else if (length == 2 && word[1] == 's') {
-                    return TOKEN_IS;
-                } else if (length == 3 && word[1] == 'n' && word[2] == 't') {
-                    return TOKEN_INT;
-                }
-
                 switch (word[1]) {
-                    case 'm':
-                        return scanner_rest(word, length, 2, 4, "port", TOKEN_IMPORT);
                     case 'f':
-                        return scanner_rest(word, length, 2, 0, "", TOKEN_IF);
-                    case 'n':
-                        return scanner_rest(word, length, 2, 7, "terface", TOKEN_INTERFACE);
+                        if (word.size() == 2) {
+                            return TOKEN_IF;
+                        }
                     case '8':
-                        return scanner_rest(word, length, 2, 0, "", TOKEN_I8);
+                        if (word.size() == 2) {
+                            return TOKEN_I8;
+                        }
                     case '1':
-                        return scanner_rest(word, length, 2, 1, "6", TOKEN_I16);
+                        if (word.substr(2, 1) == "6" && word.size() == 3) {
+                            return TOKEN_I16;
+                        }
                     case '3':
-                        return scanner_rest(word, length, 2, 1, "2", TOKEN_I32);
+                        if (word.substr(2, 1) == "2" && word.size() == 3) {
+                            return TOKEN_I32;
+                        }
                     case '6':
-                        return scanner_rest(word, length, 2, 1, "4", TOKEN_I64);
+                        if (word.substr(2, 1) == "4" && word.size() == 3) {
+                            return TOKEN_I64;
+                        }
                 }
                 break;
             }
-            case 'l': {
-                return scanner_rest(word, length, 1, 2, "et", TOKEN_LET);
-            }
-            case 'n':
-                switch (word[1]) {
-                    case 'u': // null
-                        return scanner_rest(word, length, 2, 2, "ll", TOKEN_NULL);
-                        // case 'e':// new, new 识别成 ident 在 parser 采用固定语法结构时才会被识别成 new
-                        // return scanner_rest(word, length, 2, 1, "w", TOKEN_NEW);
-                }
-                break;
-            case 'p':
-                return scanner_rest(word, length, 1, 2, "tr", TOKEN_PTR);
             case 's': {
                 // self,string,struct,sizeof,sett
-                switch (word[1]) {
-                    case 'e': {
-                        switch (word[2]) {
-                            case 't':
-                                return scanner_rest(word, length, 3, 0, "", TOKEN_SET);
-                            case 'l': // select
-                                return scanner_rest(word, length, 3, 3, "ect", TOKEN_SELECT);
-                        }
-                    }
-                }
 
                 if (length == 6 && word[1] == 't' && word[2] == 'r') {
                     switch (word[3]) {
                         case 'i':
-                            return scanner_rest(word, length, 4, 2, "ng", TOKEN_STRING);
+                            if (word.substr(4, 2) == "ng" && word.size() == 6) {
+                                return TOKEN_STRING;
+                            }
                         case 'u':
-                            return scanner_rest(word, length, 4, 2, "ct", TOKEN_STRUCT);
+                            if (word.substr(4, 2) == "ct" && word.size() == 6) {
+                                return TOKEN_STRUCT;
+                            }
                     }
                 }
                 break;
@@ -477,18 +496,20 @@ private:
             case 't': {
                 // tup/throw/type/true
                 switch (word[1]) {
-                    case 'h':
-                        return scanner_rest(word, length, 2, 3, "row", TOKEN_THROW);
                     case 'y': // type
-                        return scanner_rest(word, length, 2, 2, "pe", TOKEN_TYPE);
+                        if (word.substr(2, 2) == "pe" && word.size() == 4) {
+                            return TOKEN_TYPE;
+                        }
                     case 'u': // tup
-                        return scanner_rest(word, length, 2, 1, "p", TOKEN_TUP);
+                        if (word.substr(2, 1) == "p" && word.size() == 3) {
+                            return TOKEN_TUP;
+                        }
                     case 'r': {
                         switch (word[2]) {
-                            case 'y':
-                                return scanner_rest(word, length, 3, 0, "", TOKEN_TRY);
                             case 'u':
-                                return scanner_rest(word, length, 3, 1, "e", TOKEN_TRUE);
+                                if (word.substr(3, 1) == "e" && word.size() == 4) {
+                                    return TOKEN_TRUE;
+                                }
                         }
                         break;
                     }
@@ -498,25 +519,33 @@ private:
             case 'v': {
                 switch (word[1]) {
                     case 'a':
-                        return scanner_rest(word, length, 2, 1, "r", TOKEN_VAR);
+                        if (word.substr(2, 1) == "r" && word.size() == 3) {
+                            return TOKEN_VAR;
+                        }
                     case 'e': // vec
-                        return scanner_rest(word, length, 2, 1, "c", TOKEN_VEC);
-                    case 'o': // void
-                        return scanner_rest(word, length, 2, 2, "id", TOKEN_VOID);
+                        if (word.substr(2, 1) == "c" && word.size() == 3) {
+                            return TOKEN_VEC;
+                        }
                 }
             }
             case 'u': {
                 switch (word[1]) {
-                    case 'i':
-                        return scanner_rest(word, length, 2, 2, "nt", TOKEN_UINT);
                     case '8':
-                        return scanner_rest(word, length, 2, 0, "", TOKEN_U8);
+                        if (word.size() == 2) {
+                            return TOKEN_U8;
+                        }
                     case '1':
-                        return scanner_rest(word, length, 2, 1, "6", TOKEN_U16);
+                        if (word.substr(2, 1) == "6" && word.size() == 3) {
+                            return TOKEN_U16;
+                        }
                     case '3':
-                        return scanner_rest(word, length, 2, 1, "2", TOKEN_U32);
+                        if (word.substr(2, 1) == "2" && word.size() == 3) {
+                            return TOKEN_U32;
+                        }
                     case '6':
-                        return scanner_rest(word, length, 2, 1, "4", TOKEN_U64);
+                        if (word.substr(2, 1) == "4" && word.size() == 3) {
+                            return TOKEN_U64;
+                        }
                 }
                 break;
             }
@@ -526,28 +555,136 @@ private:
                     case 'a': {
                         switch (word[2]) {
                             case 'p':
-                                return scanner_rest(word, length, 3, 0, "", TOKEN_MAP);
-                            case 't':
-                                return scanner_rest(word, length, 3, 2, "ch", TOKEN_MATCH);
+                                if (word.size() == 3) {
+                                    return TOKEN_MAP;
+                                }
                         }
                     }
                 }
             }
             case 'r': {
-                return scanner_rest(word, length, 1, 5, "eturn", TOKEN_RETURN);
+                if (word.substr(1, 5) == "eturn" && word.size() == 6) {
+                    // return
+                    return TOKEN_RETURN;
+                }
             }
         }
 
         return TOKEN_IDENT;
     }
 
+    inline bool multi_comment_end() { // true when the two characters at the guard are "*/"
+        const auto pos = module.s_cursor.guard;
+        return module.s_cursor.source[pos] == '*' && module.s_cursor.source[pos + 1] == '/';
+    }
+
+    // Skips whitespace and comments from the guard onward; returns true if a '\n' was consumed.
+    inline bool skip_space() {
+        bool has_new = false;
+
+        if (module.s_cursor.guard != module.s_cursor.current) {
+            module.s_cursor.space_prev = module.s_cursor.source[module.s_cursor.guard - 1];
+        }
+
+        while (true) {
+            char c = module.s_cursor.source[module.s_cursor.guard];
+            switch (c) {
+                case ' ':
+                case '\r':
+                case '\t': {
+                    guard_advance();
+                    break;
+                }
+                case '\n': {
+                    guard_advance();
+                    has_new = true;
+                    break;
+                }
+                case '/': {
+                    if (module.s_cursor.source[module.s_cursor.guard + 1] == '/') {
+                        // Line comment: stop on the '\n' so the next iteration records it.
+                        while (module.s_cursor.source[module.s_cursor.guard] != '\n' && !at_eof()) {
+                            guard_advance();
+                        }
+                        break;
+                    } else if (module.s_cursor.source[module.s_cursor.guard + 1] == '*') {
+                        // Step over the opening "/*" first so that "/*/" is not mistaken for a complete comment.
+                        guard_advance();
+                        guard_advance();
+                        while (!multi_comment_end()) {
+                            if (at_eof()) {
+                                std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
+                                    << module.s_cursor.column <<". multi comment not end.";
+                                return false;
+                            }
+                            guard_advance();
+                        }
+                        guard_advance(); // skip '*'
+                        guard_advance(); // skip '/'
+                        break;
+                    }
+                    module.s_cursor.space_next = c; // lone '/': an operator, not a comment
+                    return has_new;
+                }
+                default: {
+                    module.s_cursor.space_next = c;
+                    return has_new;
+                }
+            }
+        }
+    }
+
     inline token_t item() {
         module.s_cursor.length = 0; // 重置长度
-        module.s_cursor.guard = module.s_cursor.current; // 重置游标位置
+        module.s_cursor.current = module.s_cursor.guard; // start the new token where the guard currently is
 
         if (is_alpha(module.s_cursor.source[module.s_cursor.guard])) {
             std::string word = ident_advance();
-            return token_t(ident)
+            return token_t(scanner_ident(word, word.size()), word, module.s_cursor.line, module.s_cursor.column);
         }
+
+        if (is_number(module.s_cursor.source[module.s_cursor.guard])) {
+            // A radix prefix is '0' followed by x/o/b in either case. Reading guard + 1 is safe when
+            // source is a std::string: guard is on a digit, so guard + 1 is at most the final '\0'.
+            char prefix = '\0';
+            if (module.s_cursor.source[module.s_cursor.guard] == '0') {
+                prefix = module.s_cursor.source[module.s_cursor.guard + 1];
+            }
+
+            std::string word;
+            int base = 10;
+            if (prefix == 'x' || prefix == 'X') {
+                word = hex_number_advance();
+                base = 16;
+            } else if (prefix == 'o' || prefix == 'O') {
+                word = oct_number_advance();
+                base = 8;
+            } else if (prefix == 'b' || prefix == 'B') {
+                word = bin_number_advance();
+                base = 2;
+            } else {
+                // Plain decimal, including literals that merely start with '0' such as
+                // "0" or "0.5". Previously those consumed no input, so scan() never ended.
+                word = number_advance();
+            }
+            // NOTE(review): the converted value is not stored in the token yet; the call is
+            // kept as in the original code. Confirm whether token_t should carry it.
+            const long decimal = number_convert(word, base);
+            (void)decimal;
+
+            const token_type_t type = is_float(word) ? TOKEN_LITERAL_FLOAT : TOKEN_LITERAL_INT;
+            return token_t(type, word, module.s_cursor.line, module.s_cursor.column);
+        }
+
+        if (is_string(module.s_cursor.source[module.s_cursor.guard])) {
+            std::string word = string_advance();
+            return token_t(TOKEN_LITERAL_STRING, word, module.s_cursor.line, module.s_cursor.column);
+        }
+
+        token_type_t type = special_char();
+        return token_t(type, gen_word(), module.s_cursor.line, module.s_cursor.column);
+    }
-};
\ No newline at end of file
+
+
+};
+
diff --git a/include/types.hpp b/include/types.hpp
index 185ac10..e85d6fa 100644
--- a/include/types.hpp
+++ b/include/types.hpp
@@ -11,6 +11,9 @@ struct scanner_cursor_t {
 
     int line;  // 扫描器当前所在的行
     int column;   // 扫描器当前所在的列
+
+    char space_prev;  // 记录空行，注释前的上一个字符
+    char space_next;
 };
 
 struct module_t {
@@ -27,5 +30,8 @@ struct module_t {
             s_cursor.length = 0;
             s_cursor.current = 0;
             s_cursor.guard = 0;
+
+            s_cursor.space_prev = '\0';
+            s_cursor.space_next = '\0';
     }
 };
\ No newline at end of file
diff --git a/unit/scanner_test.cpp b/unit/scanner_test.cpp
index 3715b0c..4b9e70e 100644
--- a/unit/scanner_test.cpp
+++ b/unit/scanner_test.cpp
@@ -1,30 +1,17 @@
-#include "Token.h"
+
 #include "doctest.h"
 #include "stdc++.h"
-#include "Scanner.h"
-#include "Tbs.h"
+#include "types.hpp"
+#include "syntax/Scanner.h"
+#include "syntax/token.h"
 #include <vector>
 using std::string,std::vector;
 
-TEST_CASE("Scanner test identifier table") {
-    Tbs tables;
-    std::string src = "a += b b<<=casd;";
-    Scanner scan(src, tables);
-    scan.scan();
+TEST_CASE("Scanner test") { // smoke test only: makes no assertions on the produced tokens
+    module_t module("{}"); // presumably "{}" is the source text to scan; confirm against module_t's constructor
+    Scanner scanner(module);
 
-    for (auto value_src: scan.get_token_list()) {
-        std::cout << value_src.id << " " << value_src.type << "\n";
-    }
+    scanner.scan(); // the returned token list is discarded
 }
 
 
-// TEST_CASE("Scanner test Punct table") {
-//     Tbs tables = {};
-//     std::string src = "+=---<<=>>>===--((([]--<<<>.";
-//     Scanner scan(src, tables);
-//     scan.scan();
-//     std::cout<<"test\n";
-//     for (auto e : scan.get_token_list()) {
-//         std::cout<<e.id<<" "<<e.type<<" "<<tables.PunctTable[e.id]<<"\n";
-//     }
-// }