Hydrogen/include/syntax/Scanner.h

#pragma once
#include "doctest.h"
#include "token.h"
#include "../types.hpp"
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>

class Scanner {
public:
    /// Creates a scanner bound to `module`.
    ///
    /// Only a reference is stored, so the module must outlive the scanner.
    /// All scanning state is read from and written to `module.s_cursor`.
    Scanner(module_t& module) : module(module) {}

    inline std::vector<token_t> scan() {
        std::vector<token_t> tokens;

        while (!at_eof()) {

            if (skip_space()) {
                // 如果是空格或换行，则跳过
                continue;
            }

            token_t token = item();
            tokens.push_back(token);
        }

        tokens.push_back(token_t(TOKEN_EOF, "EOF", module.s_cursor.line, module.s_cursor.column));
        return tokens;
    }


private:
    // Module being scanned. Its s_cursor member supplies the source text and
    // the positions (guard, current, length, line, column) this class updates.
    module_t& module;

    /// Returns the text of the token being read: `length` characters of the
    /// source starting at `current`.
    inline std::string gen_word() {
        const auto& cursor = module.s_cursor;
        return cursor.source.substr(cursor.current, cursor.length);
    }

    /// Returns true when `c` is a space, tab, carriage return or line feed.
    inline bool is_space(char c) {
        switch (c) {
            case ' ':
            case '\t':
            case '\r':
            case '\n':
                return true;
            default:
                return false;
        }
    }


    /// Returns true when `s` is the double-quote character that opens a
    /// string literal.
    inline bool is_string(char s) {
        const char quote = '"';
        return quote == s;
    }

    /// Decides whether a numeric lexeme denotes a floating-point literal.
    ///
    /// Rules, applied in this order:
    ///  - a lexeme ending in '.' is rejected with an error on std::cerr;
    ///  - a lexeme containing 'e' or 'E' is a float;
    ///  - a lexeme without '.' is not a float;
    ///  - a lexeme with more than one '.' is rejected with an error;
    ///  - otherwise (exactly one '.') it is a float.
    ///
    /// @param word numeric lexeme to classify; may be empty.
    /// @return true when `word` is classified as a float, false otherwise.
    inline bool is_float(std::string word) {
        int dot_count = 0;
        bool has_e = false;

        for (std::string::size_type i = 0; i < word.size(); i++) {
            if (word[i] == '.')
                dot_count++;
            else if (word[i] == 'e' || word[i] == 'E')
                has_e = true;
        }

        // A literal must not end with '.'. Test the real last character and
        // guard the empty case: `word[-1]` would index far out of bounds.
        if (!word.empty() && word.back() == '.') {
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                << module.s_cursor.column <<". floating-point numbers cannot end with '.'";
            return false;
        }

        // An exponent marker makes the literal a float even without a '.'.
        if (has_e) {
            return true;
        }

        if (dot_count == 0) {
            return false;
        }

        if (dot_count > 1) {
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                << module.s_cursor.column <<". floating-point numbers have multiple '.'";
            return false;
        }

        return true;
    }

    /// Returns true when `c` is alphabetic according to std::isalpha.
    ///
    /// The argument is converted to unsigned char first: passing a negative
    /// char (any byte >= 0x80 where char is signed) to std::isalpha is
    /// undefined behaviour.
    inline bool is_alpha(char c) {
        return std::isalpha(static_cast<unsigned char>(c)) != 0;
    }

    /// Returns true when `c` is a decimal digit according to std::isdigit.
    ///
    /// The argument is converted to unsigned char first: passing a negative
    /// char (any byte >= 0x80 where char is signed) to std::isdigit is
    /// undefined behaviour.
    inline bool is_number(char c) {
        return std::isdigit(static_cast<unsigned char>(c)) != 0;
    }

    /// Returns true when `c` is a hexadecimal digit: 0-9, a-f or A-F.
    inline bool is_hex_number(char c) {
        if ('0' <= c && c <= '9') {
            return true;
        }
        if ('a' <= c && c <= 'f') {
            return true;
        }
        return 'A' <= c && c <= 'F';
    }

    /// Returns true when `c` is an octal digit, i.e. in the range '0'..'7'.
    inline bool is_oct_number(char c) {
        const bool below = c < '0';
        const bool above = c > '7';
        return !(below || above);
    }

    /// Returns true when `c` is a binary digit, '0' or '1'.
    inline bool is_bin_number(char c) {
        switch (c) {
            case '0':
            case '1':
                return true;
            default:
                return false;
        }
    }

    /// Returns true when the character under `guard` is the '\0' terminator,
    /// which this scanner treats as end of input.
    inline bool at_eof() {
        const auto& cursor = module.s_cursor;
        return '\0' == cursor.source[cursor.guard];
    }

    inline char guard_advance() {
        module.s_cursor.guard++;
        module.s_cursor.length++;
        module.s_cursor.column++;

        if (module.s_cursor.source[module.s_cursor.guard] == '\n') {
            module.s_cursor.line++;
            module.s_cursor.column = 0;
        }

        return module.s_cursor.source[module.s_cursor.guard];
    }

    /// Consumes the character under `guard` if it equals `expected`.
    ///
    /// The character is read from `module.s_cursor.source`, the same buffer
    /// at_eof() and guard_advance() index with `guard`, so the comparison and
    /// the advance always refer to the same text.
    ///
    /// @param expected character required at the current position.
    /// @return true when it matched and was consumed; false at end of input
    ///         or on a mismatch (the cursor is then left unchanged).
    inline bool match(char expected) {
        if (at_eof())
            return false;

        if (module.s_cursor.source[module.s_cursor.guard] != expected)
            return false;

        guard_advance();
        return true;
    }

    /// Consumes a run of alphabetic and digit characters and returns it.
    ///
    /// Stops at the first character that is neither, or at end of input. The
    /// result is whatever gen_word() yields for the current token span.
    inline std::string ident_advance() {
        while (!at_eof()) {
            const char ch = module.s_cursor.source[module.s_cursor.guard];
            if (!is_alpha(ch) && !is_number(ch)) {
                break;
            }
            guard_advance();
        }

        return gen_word();
    }

    /// Reads one punctuation or operator token starting at `guard`.
    ///
    /// The first character is always consumed. Multi-character operators
    /// (such as "+=", "->", "<<=", "&&") consume their remaining characters
    /// through match().
    ///
    /// @return the token type for the operator read, or
    ///         TOKEN_NOT_IN_THIS_TYPE when the character is not recognised.
    inline token_type_t special_char() {
        const char c = module.s_cursor.source[module.s_cursor.guard];
        guard_advance();

        switch (c) {
            case '(':
                return TOKEN_LEFT_PAREN;
            case ')':
                return TOKEN_RIGHT_PAREN;
            case '[':
                return TOKEN_LEFT_SQUARE;
            case ']':
                return TOKEN_RIGHT_SQUARE;
            case '{':
                return TOKEN_LEFT_CURLY;
            case '}':
                return TOKEN_RIGHT_CURLY;
            case ':':
                return TOKEN_COLON;
            case ';':
                return TOKEN_STMT_EOF;
            case ',':
                return TOKEN_COMMA;
            case '?':
                return TOKEN_QUESTION;
            case '%':
                return match('=') ? TOKEN_PERSON_EQUAL : TOKEN_PERSON;
            case '-':
                if (match('=')) {
                    return TOKEN_MINUS_EQUAL;
                }
                if (match('>')) {
                    return TOKEN_RIGHT_ARROW;
                }
                return TOKEN_MINUS;
            case '+':
                return match('=') ? TOKEN_PLUS_EQUAL : TOKEN_PLUS;
            case '/':
                return match('=') ? TOKEN_SLASH_EQUAL : TOKEN_SLASH;
            case '*':
                return match('=') ? TOKEN_STAR_EQUAL : TOKEN_STAR;
            case '.':
                return TOKEN_DOT;
            case '!':
                return match('=') ? TOKEN_NOT_EQUAL : TOKEN_NOT;
            case '=':
                return match('=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL;
            case '<':
                if (match('<')) {
                    // "<<=" or "<<"
                    return match('=') ? TOKEN_LEFT_SHIFT_EQUAL : TOKEN_LEFT_SHIFT;
                }
                if (match('=')) {
                    return TOKEN_LESS_EQUAL;
                }
                return TOKEN_LEFT_ANGLE;
            case '>':
                if (match('=')) {
                    return TOKEN_GREATER_EQUAL;
                }
                // Take the second '>' only when the full ">>=" is present.
                // Consuming it unconditionally would swallow one '>' of a
                // plain ">>" and still report a single TOKEN_RIGHT_ANGLE.
                if (module.s_cursor.source[module.s_cursor.guard] == '>' &&
                    module.s_cursor.source[module.s_cursor.guard + 1] == '=') {
                    guard_advance(); // second '>'
                    guard_advance(); // '='
                    return TOKEN_RIGHT_SHIFT_EQUAL;
                }
                return TOKEN_RIGHT_ANGLE;
            case '&':
                return match('&') ? TOKEN_AND_AND : TOKEN_AND;
            case '|':
                return match('|') ? TOKEN_OR_OR : TOKEN_OR;
            case '~':
                return TOKEN_TILDE;
            case '^':
                return match('=') ? TOKEN_XOR_EQUAL : TOKEN_XOR;
            default:
                return token_type_t::TOKEN_NOT_IN_THIS_TYPE;
        }
    }

    /// Reads a double-quoted string literal whose opening quote is under
    /// `guard` and returns its decoded contents (without the quotes).
    ///
    /// Backslash escapes n, t, r, b, f, a, v and 0 are translated to the
    /// corresponding control character; \\, \' and \" yield the character
    /// itself. Any other escape is reported on std::cerr and the character
    /// after the backslash is kept as-is. A raw newline inside the literal is
    /// reported as well but still copied.
    ///
    /// The closing quote is consumed when present. If input ends first (also
    /// right after a backslash) an error is reported and the cursor is left
    /// on the terminator instead of being moved past it.
    inline std::string string_advance() {
        module.s_cursor.guard++; // skip the opening quote
        const char escape_char = '\\';

        std::string buf;

        while (module.s_cursor.source[module.s_cursor.guard] != '\"' && !at_eof()) {
            char guard = module.s_cursor.source[module.s_cursor.guard];

            if (guard == '\n') {
                std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                << module.s_cursor.column <<". string cannot newline.";
            }

            // Handle an escape sequence.
            if (guard == escape_char) {
                // Step over the backslash itself.
                module.s_cursor.guard++;

                if (at_eof()) {
                    // The backslash was the last character of the input;
                    // there is nothing to decode and nothing left to consume.
                    break;
                }

                guard = module.s_cursor.source[module.s_cursor.guard];

                switch (guard) {
                    case 'n':
                        guard = '\n';
                        break;
                    case 't':
                        guard = '\t';
                        break;
                    case 'r':
                        guard = '\r';
                        break;
                    case 'b':
                        guard = '\b';
                        break;
                    case 'f':
                        guard = '\f';
                        break;
                    case 'a':
                        guard = '\a';
                        break;
                    case 'v':
                        guard = '\v';
                        break;
                    case '0':
                        guard = '\0';
                        break;
                    case '\\':
                    case '\'':
                    case '\"':
                        break;
                    default:
                        std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                        << module.s_cursor.column <<". unknown escape char " << guard;
                }
            }

            buf.push_back(guard);
            guard_advance();
        }

        if (at_eof()) {
            // No closing quote: stay on the terminator so later end-of-input
            // checks keep reading inside the source buffer.
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
            << module.s_cursor.column <<". unterminated string.";
        } else {
            // Skip the closing quote.
            module.s_cursor.guard++;
        }

        return buf;
    }

    /// Parses `word` as an integer in the given `base` using std::stol.
    ///
    /// @param word digits to parse (any radix prefix already removed).
    /// @param base numeric base handed to std::stol.
    /// @return the parsed value, or 0 after printing an error to std::cerr
    ///         when std::stol throws invalid_argument or out_of_range.
    inline long number_convert(std::string word, int base) {
        long value = 0;

        try {
            value = std::stol(word, nullptr, base);
        } catch (const std::invalid_argument&) {
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                << module.s_cursor.column <<". Invalid number: " << word << std::endl;
        } catch (const std::out_of_range&) {
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                << module.s_cursor.column <<". Number out of range: " << word << std::endl;
        }

        return value;
    }

    /// Parses `word` as a floating-point number using std::stod.
    ///
    /// @param word lexeme to parse.
    /// @return the parsed value, or 0 after printing an error to std::cerr
    ///         when std::stod throws invalid_argument or out_of_range.
    inline double number_convert_float(std::string word) {
        double value = 0;

        try {
            value = std::stod(word);
        } catch (const std::invalid_argument&) {
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                << module.s_cursor.column <<". Invalid number: " << word << std::endl;
        } catch (const std::out_of_range&) {
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                << module.s_cursor.column <<". Number out of range: " << word << std::endl;
        }

        return value;
    }

    /// Consumes a hexadecimal literal and returns its digits without the
    /// two-character "0x" prefix.
    ///
    /// The prefix is skipped by moving `guard` directly, so `length` counts
    /// only the digits consumed afterwards.
    inline std::string hex_number_advance() {
        auto& cursor = module.s_cursor;
        cursor.guard += 2; // step over "0x"

        for (;;) {
            if (at_eof() || !is_hex_number(cursor.source[cursor.guard])) {
                break;
            }
            guard_advance();
        }

        return cursor.source.substr(cursor.current + 2, cursor.length);
    }

    /// Consumes an octal literal and returns its digits without the
    /// two-character "0o" prefix.
    ///
    /// The prefix is skipped by moving `guard` directly, so `length` counts
    /// only the digits consumed afterwards.
    inline std::string oct_number_advance() {
        auto& cursor = module.s_cursor;
        cursor.guard += 2; // step over "0o"

        while (!at_eof()) {
            const char digit = cursor.source[cursor.guard];
            if (!is_oct_number(digit)) {
                break;
            }
            guard_advance();
        }

        const auto digits_begin = cursor.current + 2;
        return cursor.source.substr(digits_begin, cursor.length);
    }

    /// Consumes a binary literal and returns its digits without the
    /// two-character "0b" prefix.
    ///
    /// The prefix is skipped by moving `guard` directly, so `length` counts
    /// only the digits consumed afterwards.
    inline std::string bin_number_advance() {
        auto& cursor = module.s_cursor;
        cursor.guard += 2; // step over "0b"

        bool more = !at_eof() && is_bin_number(cursor.source[cursor.guard]);
        while (more) {
            guard_advance();
            more = !at_eof() && is_bin_number(cursor.source[cursor.guard]);
        }

        return cursor.source.substr(cursor.current + 2, cursor.length);
    }

    /// Consumes a decimal literal and returns its text via gen_word().
    ///
    /// Accepted shape: digits, then optionally '.' followed by at least one
    /// digit, then optionally an exponent ('e' or 'E', an optional sign, and
    /// at least one digit). A '.' or exponent marker that is not followed by
    /// the required digit is left unconsumed, so the literal ends before it.
    inline std::string number_advance() {
        auto& cursor = module.s_cursor;

        // Integer part.
        while (is_number(cursor.source[cursor.guard]) && !at_eof()) {
            guard_advance();
        }

        // Fractional part: only when a digit follows the dot.
        if (cursor.source[cursor.guard] == '.' && is_number(cursor.source[cursor.guard + 1])) {
            guard_advance(); // the decimal point
            while (is_number(cursor.source[cursor.guard]) && !at_eof()) {
                guard_advance();
            }
        }

        // Exponent part. Look ahead past an optional sign and require a digit
        // there; otherwise input such as "1e+" would be consumed as a literal
        // with an empty exponent.
        const char marker = cursor.source[cursor.guard];
        if (marker == 'e' || marker == 'E') {
            const char after = cursor.source[cursor.guard + 1];
            const bool has_sign = (after == '+' || after == '-');
            // Reading guard + 2 is in range: a sign at guard + 1 means the
            // terminator is at guard + 2 or later.
            const char first_digit = has_sign ? cursor.source[cursor.guard + 2] : after;

            if (is_number(first_digit)) {
                guard_advance(); // 'e' or 'E'
                if (has_sign) {
                    guard_advance(); // '+' or '-'
                }
                while (is_number(cursor.source[cursor.guard]) && !at_eof()) {
                    guard_advance();
                }
            }
        }

        return gen_word();
    }

    /// Classifies an identifier-shaped word as a keyword or plain identifier.
    ///
    /// Recognised keywords: array, bool, break, continue, else, fn, false,
    /// f32, f64, for, if, i8, i16, i32, i64, string, struct, type, tup, true,
    /// var, vec, u8, u16, u32, u64, map, return.
    ///
    /// A word is a keyword only on an exact, whole-word match. The previous
    /// nested switch had missing `break`s, so look-alike identifiers fell
    /// through into neighbouring cases (e.g. "strict" was reported as struct,
    /// "typ" as tup, "en" as fn, "v8" as u8).
    ///
    /// @param word   the scanned word.
    /// @param length length of `word`; kept for interface compatibility, the
    ///               comparison uses `word` itself.
    /// @return the keyword's token type, or TOKEN_IDENT when `word` is not a
    ///         keyword.
    static token_type_t scanner_ident(std::string word, int length) {
        (void)length;

        struct keyword_t {
            const char* text;
            token_type_t type;
        };

        static const keyword_t keywords[] = {
            {"array",    TOKEN_ARR},
            {"bool",     TOKEN_BOOL},
            {"break",    TOKEN_BREAK},
            {"continue", TOKEN_CONTINUE},
            {"else",     TOKEN_ELSE},
            {"fn",       TOKEN_FN},
            {"false",    TOKEN_FALSE},
            {"f32",      TOKEN_F32},
            {"f64",      TOKEN_F64},
            {"for",      TOKEN_FOR},
            {"if",       TOKEN_IF},
            {"i8",       TOKEN_I8},
            {"i16",      TOKEN_I16},
            {"i32",      TOKEN_I32},
            {"i64",      TOKEN_I64},
            {"string",   TOKEN_STRING},
            {"struct",   TOKEN_STRUCT},
            {"type",     TOKEN_TYPE},
            {"tup",      TOKEN_TUP},
            {"true",     TOKEN_TRUE},
            {"var",      TOKEN_VAR},
            {"vec",      TOKEN_VEC},
            {"u8",       TOKEN_U8},
            {"u16",      TOKEN_U16},
            {"u32",      TOKEN_U32},
            {"u64",      TOKEN_U64},
            {"map",      TOKEN_MAP},
            {"return",   TOKEN_RETURN},
        };

        for (const keyword_t& keyword : keywords) {
            if (word == keyword.text) {
                return keyword.type;
            }
        }

        return TOKEN_IDENT;
    }

    /// Returns true when the two characters starting at `guard` are "*/",
    /// the terminator of a block comment.
    inline bool multi_comment_end() {
        const auto& cursor = module.s_cursor;
        if (cursor.source[cursor.guard] != '*') {
            return false;
        }
        return cursor.source[cursor.guard + 1] == '/';
    }

    /// Skips blanks, line comments ("// ...") and block comments ("/* ... */").
    ///
    /// Before skipping, `space_prev` is set to the character just before
    /// `guard` (only when `guard` differs from `current`). When a character
    /// that starts a token is reached, it is stored in `space_next`.
    ///
    /// @return true when at least one '\n' was skipped outside of comments;
    ///         false otherwise, including when a block comment is still open
    ///         at end of input (an error is printed to std::cerr then).
    inline bool skip_space() {
        bool has_new = false;

        if (module.s_cursor.guard != module.s_cursor.current) {
            module.s_cursor.space_prev = module.s_cursor.source[module.s_cursor.guard - 1];
        }

        while (true) {
            const char c = module.s_cursor.source[module.s_cursor.guard];
            switch (c) {
                case ' ':
                case '\r':
                case '\t': {
                    guard_advance();
                    break;
                }
                case '\n': {
                    guard_advance();
                    has_new = true;
                    break;
                }
                case '/': {
                    const char next = module.s_cursor.source[module.s_cursor.guard + 1];

                    if (next == '/') {
                        // Line comment: skip up to, but not including, the newline.
                        while (module.s_cursor.source[module.s_cursor.guard] != '\n' && !at_eof()) {
                            guard_advance();
                        }
                        break;
                    }

                    if (next == '*') {
                        // Step over the opening "/*" first, so its '*' cannot
                        // pair with a following '/' and make "/*/" look like
                        // a complete comment.
                        guard_advance(); // '/'
                        guard_advance(); // '*'

                        while (!multi_comment_end()) {
                            if (at_eof()) {
                                std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                                    << module.s_cursor.column <<". multi comment not end.";
                                return false;
                            }
                            guard_advance();
                        }

                        guard_advance(); // closing '*'
                        guard_advance(); // closing '/'
                        break;
                    }

                    // A lone '/' is an operator and starts the next token.
                    module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard];
                    return has_new;
                }
                default: {
                    module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard];
                    return has_new;
                }
            }
        }
    }

    /// Reads exactly one token starting at `guard`.
    ///
    /// `length` is reset and `current` is set to `guard` first. Dispatch is on
    /// the first character:
    ///  - alphabetic: identifier or keyword (ident_advance + scanner_ident);
    ///  - digit: numeric literal. "0x", "0o" and "0b" (either case) select
    ///    hexadecimal, octal and binary integers, whose lexeme is the digits
    ///    without the prefix. Anything else is decimal, classified as float
    ///    or int by is_float();
    ///  - '"': string literal (string_advance);
    ///  - otherwise: punctuation or operator (special_char).
    ///
    /// Numeric lexemes are passed through number_convert() or
    /// number_convert_float() only so that malformed or out-of-range literals
    /// are reported; the converted value is not stored in the token.
    ///
    /// @return the token, stamped with the cursor line and column as they are
    ///         after the token has been consumed.
    inline token_t item() {
        module.s_cursor.length = 0;                      // restart the token length
        module.s_cursor.current = module.s_cursor.guard; // token starts here

        const char first = module.s_cursor.source[module.s_cursor.guard];

        if (is_alpha(first)) {
            std::string word = ident_advance();
            return token_t(scanner_ident(word, word.size()),word, module.s_cursor.line, module.s_cursor.column);
        }

        if (is_number(first)) {
            std::string word;
            token_type_t type = TOKEN_LITERAL_INT;

            // A radix prefix is only possible after a leading '0'.
            const char prefix = (first == '0') ? module.s_cursor.source[module.s_cursor.guard + 1] : '\0';

            if (prefix == 'x' || prefix == 'X') {
                // Prefixed literals are always integers. They must not go
                // through is_float(): hex digits may contain 'e'/'E', which
                // is_float() reads as an exponent marker.
                word = hex_number_advance();
                number_convert(word, 16);
            } else if (prefix == 'o' || prefix == 'O') {
                word = oct_number_advance();
                number_convert(word, 8);
            } else if (prefix == 'b' || prefix == 'B') {
                word = bin_number_advance();
                number_convert(word, 2);
            } else {
                word = number_advance();

                if (is_float(word)) {
                    // Validate floats with the float parser and keep the
                    // lexeme intact. Removing the leading '0' of "0.5" and
                    // checking ".5" with the integer parser reported a bogus
                    // "Invalid number".
                    type = TOKEN_LITERAL_FLOAT;
                    number_convert_float(word);
                } else {
                    // Integer with a leading zero: drop that single zero.
                    if (first == '0' && word.size() > 1) {
                        word = word.substr(1);
                    }
                    number_convert(word, 10);
                }
            }

            return token_t(type, word, module.s_cursor.line, module.s_cursor.column);
        }

        if (is_string(first)) {
            std::string word = string_advance();
            return token_t(TOKEN_LITERAL_STRING, word, module.s_cursor.line, module.s_cursor.column);
        }

        token_type_t type = special_char();

        return token_t(type, gen_word(), module.s_cursor.line, module.s_cursor.column);
    }


};