Hydrogen/include/syntax/Scanner.h

553 lines
18 KiB
C++

#pragma once
#include "doctest.h"
#include "token.h"
#include "../types.hpp"
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
class Scanner {
public:
// Binds the scanner to the module whose cursor state (module.s_cursor) the
// helper methods below read and advance.
Scanner(module_t& module)
: module(module) {}
private:
// Module being scanned. Held by reference, so the referenced module must
// outlive this Scanner.
module_t& module;
// Returns the text of the token in progress: `length` characters of the
// source starting at offset `current`.
inline std::string gen_word() {
    const auto& cursor = module.s_cursor;
    return cursor.source.substr(cursor.current, cursor.length);
}
// True for the four whitespace characters the scanner recognises:
// newline, tab, carriage return and space.
inline bool is_space(char c) {
    switch (c) {
        case '\n':
        case '\t':
        case '\r':
        case ' ':
            return true;
        default:
            return false;
    }
}
// True when `s` is a double quote, i.e. the character that opens a string
// literal.
inline bool is_string(char s) {
    const char double_quote = '"';
    return double_quote == s;
}
// Decides whether a numeric word should be treated as a floating-point
// literal.
//
// Returns true when the word contains an exponent marker ('e'/'E') or exactly
// one '.'. Returns false for a word without '.', and reports an error on
// std::cerr (then returns false) when the word ends with '.' or contains more
// than one '.'.
//
// NOTE(review): any 'e'/'E' counts as an exponent marker, so a word made of
// hex digits would also qualify - confirm callers only pass decimal words.
inline bool is_float(std::string word) {
    int dot_count = 0;
    bool has_e = false;
    for (const char ch : word) {
        if (ch == '.')
            dot_count++;
        else if (ch == 'e' || ch == 'E')
            has_e = true;
    }
    // A literal must not end with '.'. The original indexed word[-1], which
    // wraps to a huge unsigned index and reads out of bounds; check the real
    // last character, and only when there is one.
    if (!word.empty() && word.back() == '.') {
        std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                  << module.s_cursor.column <<". floating-point numbers cannot end with '.'";
        return false;
    }
    // An exponent marker makes the word a float regardless of dots.
    if (has_e) {
        return true;
    }
    if (dot_count == 0) {
        return false;
    }
    if (dot_count > 1) {
        std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                  << module.s_cursor.column <<". floating-point numbers have multiple '.'";
        return false;
    }
    return true;
}
// True when `c` is an alphabetic character according to std::isalpha.
//
// The argument is converted to unsigned char first: passing a negative char
// (any byte >= 0x80 where char is signed, e.g. UTF-8 text in the source) to
// the <cctype> functions is undefined behaviour.
inline bool is_alpha(char c) {
    return std::isalpha(static_cast<unsigned char>(c)) != 0;
}
// True when `c` is a decimal digit according to std::isdigit.
//
// The argument is converted to unsigned char first: passing a negative char
// (any byte >= 0x80 where char is signed) to the <cctype> functions is
// undefined behaviour.
inline bool is_number(char c) {
    return std::isdigit(static_cast<unsigned char>(c)) != 0;
}
// True when `c` is a hexadecimal digit: 0-9, a-f or A-F.
inline bool is_hex_number(char c) {
    if (c >= '0' && c <= '9') {
        return true;
    }
    if (c >= 'a' && c <= 'f') {
        return true;
    }
    return c >= 'A' && c <= 'F';
}
// True when `c` is an octal digit, '0' through '7'.
inline bool is_oct_number(char c) {
    const bool below_range = c < '0';
    const bool above_range = c > '7';
    return !(below_range || above_range);
}
// True when `c` is a binary digit, '0' or '1'.
inline bool is_bin_number(char c) {
    switch (c) {
        case '0':
        case '1':
            return true;
        default:
            return false;
    }
}
// True when the guard has reached the end of the source text.
//
// `guard` is an offset into module.s_cursor.source (it is used as an index
// everywhere else in this class). The original compared that offset with
// '\0', which is only true at offset 0 - i.e. at the very start of the input
// rather than at its end. The check now looks at the position itself and at
// the character stored there.
inline bool at_eof() {
    const auto& text = module.s_cursor.source;
    const auto pos = static_cast<std::string::size_type>(module.s_cursor.guard);
    // Past the last character, or sitting on a NUL terminator.
    return pos >= text.size() || text[pos] == '\0';
}
// Consumes the character under the guard and returns it.
//
// Moves the guard one position forward and bumps the token length and the
// column. When the consumed character is a newline, the line counter is
// incremented and the column restarts at 0.
//
// The original read the character *after* the increment, so it returned (and
// newline-checked) the next character instead of the consumed one.
// scanner_special_char() switches on the return value while the guard sits on
// the operator itself, so it needs the consumed character.
inline char guard_advance() {
    const char consumed = module.s_cursor.source[module.s_cursor.guard];
    module.s_cursor.guard++;
    module.s_cursor.length++;
    module.s_cursor.column++;
    if (consumed == '\n') {
        module.s_cursor.line++;
        module.s_cursor.column = 0;
    }
    return consumed;
}
// Consumes the character under the guard if it equals `expected`.
//
// Returns true (after advancing the guard) on a match; returns false and
// leaves the cursor untouched at end of input or on a mismatch.
//
// The character is read from module.s_cursor.source, the same buffer every
// other helper in this class indexes with the guard; the original read
// module.source here.
inline bool match(char expected) {
    if (at_eof()) {
        return false;
    }
    if (module.s_cursor.source[module.s_cursor.guard] != expected) {
        return false;
    }
    guard_advance();
    return true;
}
// Advances the guard over a run of letters and digits (as judged by
// is_alpha / is_number), stopping when at_eof() reports the end of input,
// and returns the word produced by gen_word().
inline std::string ident_advance() {
    for (;;) {
        const char ch = module.s_cursor.source[module.s_cursor.guard];
        const bool ident_char = is_alpha(ch) || is_number(ch);
        if (!ident_char || at_eof()) {
            break;
        }
        guard_advance();
    }
    return gen_word();
}
// Scans one punctuation / operator token.
//
// Calls guard_advance() once and switches on the character it returns;
// multi-character operators are recognised by probing the following
// characters with match(). Returns TOKEN_NOT_IN_THIS_TYPE when the character
// is not one of the handled symbols.
//
// `m` is not used; the scanner works on the `module` member. The parameter is
// kept so existing call sites keep compiling.
inline token_type_t scanner_special_char(module_t *m) {
    (void)m;
    const char c = guard_advance();
    switch (c) {
        case '(':
            return TOKEN_LEFT_PAREN;
        case ')':
            return TOKEN_RIGHT_PAREN;
        case '[':
            return TOKEN_LEFT_SQUARE;
        case ']':
            return TOKEN_RIGHT_SQUARE;
        case '{':
            return TOKEN_LEFT_CURLY;
        case '}':
            return TOKEN_RIGHT_CURLY;
        case ':':
            return TOKEN_COLON;
        case ';':
            return TOKEN_STMT_EOF;
        case ',':
            return TOKEN_COMMA;
        case '?':
            return TOKEN_QUESTION;
        case '%':
            return match('=') ? TOKEN_PERSON_EQUAL : TOKEN_PERSON;
        case '-':
            if (match('=')) {
                return TOKEN_MINUS_EQUAL;
            }
            if (match('>')) {
                return TOKEN_RIGHT_ARROW;
            }
            return TOKEN_MINUS;
        case '+':
            return match('=') ? TOKEN_PLUS_EQUAL : TOKEN_PLUS;
        case '/':
            return match('=') ? TOKEN_SLASH_EQUAL : TOKEN_SLASH;
        case '*':
            return match('=') ? TOKEN_STAR_EQUAL : TOKEN_STAR;
        case '.':
            return TOKEN_DOT;
        case '!':
            return match('=') ? TOKEN_NOT_EQUAL : TOKEN_NOT;
        case '=':
            return match('=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL;
        case '<':
            if (match('<')) {
                // "<<=" or plain "<<"
                if (match('=')) {
                    return TOKEN_LEFT_SHIFT_EQUAL;
                }
                return TOKEN_LEFT_SHIFT;
            } else if (match('=')) {
                return TOKEN_LESS_EQUAL;
            }
            return TOKEN_LEFT_ANGLE;
        case '>': {
            if (match('=')) {
                // ">="
                return TOKEN_GREATER_EQUAL;
            }
            // ">>=": look ahead at both characters before consuming any.
            // The original used match('>') && match('='), which swallowed the
            // second '>' of a plain ">>" and then still returned a single
            // TOKEN_RIGHT_ANGLE, silently dropping a character. The second
            // index is only read when the first comparison succeeded, so it
            // is at most one past the last real character.
            // NOTE(review): a plain ">>" is now left as two '>' tokens; if a
            // dedicated right-shift token exists it could be returned here.
            if (module.s_cursor.source[module.s_cursor.guard] == '>' &&
                module.s_cursor.source[module.s_cursor.guard + 1] == '=') {
                guard_advance();
                guard_advance();
                return TOKEN_RIGHT_SHIFT_EQUAL;
            }
            return TOKEN_RIGHT_ANGLE; // ">"
        }
        case '&':
            return match('&') ? TOKEN_AND_AND : TOKEN_AND;
        case '|':
            return match('|') ? TOKEN_OR_OR : TOKEN_OR;
        case '~':
            return TOKEN_TILDE;
        case '^':
            return match('=') ? TOKEN_XOR_EQUAL : TOKEN_XOR;
        default:
            return token_type_t::TOKEN_NOT_IN_THIS_TYPE;
    }
}
// Scans a double-quoted string literal and returns its contents with escape
// sequences decoded.
//
// On entry the guard sits on the opening quote. Characters are collected
// until a closing quote is seen or at_eof() reports the end of input; the
// guard is then moved one position further to step over the closing
// character. A raw newline inside the literal and an unrecognised escape are
// reported on std::cerr, but scanning continues and the offending character
// is still appended.
inline std::string string_advance() {
    // Step over the opening quote.
    ++module.s_cursor.guard;
    const char backslash = '\\';
    // Escape letters and the characters they stand for, index-aligned.
    const std::string escape_names = "ntrbfav0";
    const char escape_values[] = {'\n', '\t', '\r', '\b', '\f', '\a', '\v', '\0'};
    std::string text;
    for (;;) {
        if (module.s_cursor.source[module.s_cursor.guard] == '\"' || at_eof()) {
            break;
        }
        char ch = module.s_cursor.source[module.s_cursor.guard];
        if (ch == '\n') {
            std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                      << module.s_cursor.column <<". string cannot newline.";
        }
        if (ch == backslash) {
            // Drop the backslash and decode the character that follows it.
            ++module.s_cursor.guard;
            ch = module.s_cursor.source[module.s_cursor.guard];
            const std::string::size_type hit = escape_names.find(ch);
            if (hit != std::string::npos) {
                ch = escape_values[hit];
            } else if (ch != '\\' && ch != '\'' && ch != '\"') {
                // Backslash, single quote and double quote stand for
                // themselves; anything else is not a known escape.
                std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                          << module.s_cursor.column <<". unknown escape char " << ch;
            }
        }
        text.push_back(ch);
        guard_advance();
    }
    // Step over the closing character.
    ++module.s_cursor.guard;
    return text;
}
// Converts `word` to a long using std::stol in the given `base`.
//
// When std::stol throws std::invalid_argument or std::out_of_range, an error
// carrying the current cursor line/column is written to std::cerr and 0 is
// returned.
inline long number_convert(std::string word, int base) {
    const char* failure = nullptr;
    long value = 0;
    try {
        value = std::stol(word, nullptr, base);
    } catch (const std::invalid_argument&) {
        failure = ". Invalid number: ";
    } catch (const std::out_of_range&) {
        failure = ". Number out of range: ";
    }
    if (failure != nullptr) {
        std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
                  << module.s_cursor.column << failure << word << std::endl;
        return 0;
    }
    return value;
}
// Scans a hexadecimal literal and returns its digits without the "0x" prefix.
//
// The original returned gen_word(), i.e. `length` characters starting at
// `current`. Because the prefix is skipped by bumping the guard directly,
// `length` only counts the digits while `current` still points at the '0',
// so "0x1F" came back as "0x". The digit run is now sliced out explicitly.
// NOTE(review): column and length are still not advanced for the two prefix
// characters - confirm whether later position bookkeeping needs them.
inline std::string hex_number_advance() {
    module.s_cursor.guard += 2; // skip "0x"
    const auto digits_begin = module.s_cursor.guard;
    while (is_hex_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
        guard_advance();
    }
    return module.s_cursor.source.substr(digits_begin, module.s_cursor.guard - digits_begin);
}
// Scans an octal literal and returns its digits without the "0o" prefix.
//
// The original returned gen_word(), i.e. `length` characters starting at
// `current`. Because the prefix is skipped by bumping the guard directly,
// `length` only counts the digits while `current` still points at the '0',
// so "0o17" came back as "0o". The digit run is now sliced out explicitly.
// NOTE(review): column and length are still not advanced for the two prefix
// characters - confirm whether later position bookkeeping needs them.
inline std::string oct_number_advance() {
    module.s_cursor.guard += 2; // skip "0o"
    const auto digits_begin = module.s_cursor.guard;
    while (is_oct_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
        guard_advance();
    }
    return module.s_cursor.source.substr(digits_begin, module.s_cursor.guard - digits_begin);
}
// Scans a binary literal and returns its digits without the "0b" prefix.
//
// The original returned gen_word(), i.e. `length` characters starting at
// `current`. Because the prefix is skipped by bumping the guard directly,
// `length` only counts the digits while `current` still points at the '0',
// so "0b101" came back as "0b1". The digit run is now sliced out explicitly.
// NOTE(review): column and length are still not advanced for the two prefix
// characters - confirm whether later position bookkeeping needs them.
inline std::string bin_number_advance() {
    module.s_cursor.guard += 2; // skip "0b"
    const auto digits_begin = module.s_cursor.guard;
    while (is_bin_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
        guard_advance();
    }
    return module.s_cursor.source.substr(digits_begin, module.s_cursor.guard - digits_begin);
}
// Scans a decimal literal: an integer part, an optional fraction (".digits")
// and an optional exponent ('e'/'E', optional sign, digits), then returns the
// word produced by gen_word().
//
// The '.' is only consumed when a digit follows it, and the exponent marker
// is only consumed when it is followed by a digit or a sign.
inline std::string number_advance() {
    // Consumes a (possibly empty) run of decimal digits.
    const auto skip_digits = [&]() {
        while (is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
            guard_advance();
        }
    };

    // Integer part.
    skip_digits();

    // Fractional part.
    if (module.s_cursor.source[module.s_cursor.guard] == '.' &&
        is_number(module.s_cursor.source[module.s_cursor.guard + 1])) {
        guard_advance(); // the decimal point
        skip_digits();
    }

    // Exponent part.
    const char marker = module.s_cursor.source[module.s_cursor.guard];
    if (marker == 'e' || marker == 'E') {
        const char following = module.s_cursor.source[module.s_cursor.guard + 1];
        if (is_number(following) || following == '+' || following == '-') {
            guard_advance(); // the 'e' / 'E'
            const char sign = module.s_cursor.source[module.s_cursor.guard];
            if (sign == '+' || sign == '-') {
                guard_advance(); // the sign
            }
            skip_digits();
        }
    }
    return gen_word();
}
// Maps an identifier-shaped word to its keyword token, or TOKEN_IDENT when it
// is not a keyword.
//
// `word` is the scanned text and `length` its length in characters (assumed
// to equal word.size() - TODO confirm against the caller). Dispatch is on the
// first one to three characters; the remainder is checked by scanner_rest().
// scanner_rest() is defined outside this view - presumably it returns the
// given token only when the rest of the word matches exactly and TOKEN_IDENT
// otherwise; verify.
//
// Fixes relative to the original:
//  * the `if` guarding TOKEN_BOOL was never closed, which left the braces of
//    the whole function unbalanced;
//  * "array", "bool", "break" and "continue" were matched by prefix only, so
//    longer identifiers starting with them became keywords - the length is
//    now checked as well;
//  * any word starting with "else" returned TOKEN_ELSE, and the TOKEN_ELSE_IF
//    test (length == 3 together with word[3]) could not hold for a word that
//    begins with "else"; the exact match is left to scanner_rest();
//  * the 'v' and 'm' cases fell through into 'u' and 'r', so a non-keyword
//    starting with 'v' or 'm' was tested against the 'u…' / "return"
//    patterns.
static token_type_t scanner_ident(std::string word, int length) {
    switch (word[0]) {
        case 'a': {
            // array
            if (length == 5 && word[1] == 'r' && word.substr(2, 3) == "ray") {
                return TOKEN_ARR;
            }
            break;
        }
        case 'b': {
            switch (word[1]) {
                case 'o': // bool
                    if (length == 4 && word.substr(2, 2) == "ol") {
                        return TOKEN_BOOL;
                    }
                    break;
                case 'r': // break
                    if (length == 5 && word.substr(2, 3) == "eak") {
                        return TOKEN_BREAK;
                    }
                    break;
            }
            break;
        }
        case 'c': {
            // continue
            if (length == 8 && word[1] == 'o' && word.substr(2, 6) == "ntinue") {
                return TOKEN_CONTINUE;
            }
            break;
        }
        case 'e':
            // else
            return scanner_rest(word, length, 1, 3, "lse", TOKEN_ELSE);
        case 'f': {
            switch (word[1]) {
                case 'n':
                    return scanner_rest(word, length, 2, 0, "", TOKEN_FN);
                case 'a':
                    return scanner_rest(word, length, 2, 3, "lse", TOKEN_FALSE);
                case 'l':
                    return scanner_rest(word, length, 2, 3, "oat", TOKEN_FLOAT);
                case '3':
                    return scanner_rest(word, length, 2, 1, "2", TOKEN_F32);
                case '6':
                    return scanner_rest(word, length, 2, 1, "4", TOKEN_F64);
                case 'o':
                    return scanner_rest(word, length, 2, 1, "r", TOKEN_FOR);
            }
            break;
        }
        case 'g':
            return scanner_rest(word, length, 1, 1, "o", TOKEN_GO);
        case 'i': {
            // Short keywords that share a prefix with longer ones are checked
            // first: in, is, int.
            if (length == 2 && word[1] == 'n') {
                return TOKEN_IN;
            } else if (length == 2 && word[1] == 's') {
                return TOKEN_IS;
            } else if (length == 3 && word[1] == 'n' && word[2] == 't') {
                return TOKEN_INT;
            }
            switch (word[1]) {
                case 'm':
                    return scanner_rest(word, length, 2, 4, "port", TOKEN_IMPORT);
                case 'f':
                    return scanner_rest(word, length, 2, 0, "", TOKEN_IF);
                case 'n':
                    return scanner_rest(word, length, 2, 7, "terface", TOKEN_INTERFACE);
                case '8':
                    return scanner_rest(word, length, 2, 0, "", TOKEN_I8);
                case '1':
                    return scanner_rest(word, length, 2, 1, "6", TOKEN_I16);
                case '3':
                    return scanner_rest(word, length, 2, 1, "2", TOKEN_I32);
                case '6':
                    return scanner_rest(word, length, 2, 1, "4", TOKEN_I64);
            }
            break;
        }
        case 'l':
            return scanner_rest(word, length, 1, 2, "et", TOKEN_LET);
        case 'n': {
            switch (word[1]) {
                case 'u': // null
                    return scanner_rest(word, length, 2, 2, "ll", TOKEN_NULL);
                // "new" is deliberately left as an identifier; the parser
                // recognises it only inside its fixed syntactic form.
                // case 'e':
                //     return scanner_rest(word, length, 2, 1, "w", TOKEN_NEW);
            }
            break;
        }
        case 'p':
            return scanner_rest(word, length, 1, 2, "tr", TOKEN_PTR);
        case 's': {
            // set, select, string, struct
            switch (word[1]) {
                case 'e': {
                    switch (word[2]) {
                        case 't':
                            return scanner_rest(word, length, 3, 0, "", TOKEN_SET);
                        case 'l': // select
                            return scanner_rest(word, length, 3, 3, "ect", TOKEN_SELECT);
                    }
                    break;
                }
            }
            if (length == 6 && word[1] == 't' && word[2] == 'r') {
                switch (word[3]) {
                    case 'i':
                        return scanner_rest(word, length, 4, 2, "ng", TOKEN_STRING);
                    case 'u':
                        return scanner_rest(word, length, 4, 2, "ct", TOKEN_STRUCT);
                }
            }
            break;
        }
        case 't': {
            // throw, type, tup, try, true
            switch (word[1]) {
                case 'h':
                    return scanner_rest(word, length, 2, 3, "row", TOKEN_THROW);
                case 'y': // type
                    return scanner_rest(word, length, 2, 2, "pe", TOKEN_TYPE);
                case 'u': // tup
                    return scanner_rest(word, length, 2, 1, "p", TOKEN_TUP);
                case 'r': {
                    switch (word[2]) {
                        case 'y':
                            return scanner_rest(word, length, 3, 0, "", TOKEN_TRY);
                        case 'u':
                            return scanner_rest(word, length, 3, 1, "e", TOKEN_TRUE);
                    }
                    break;
                }
            }
            break;
        }
        case 'v': {
            switch (word[1]) {
                case 'a':
                    return scanner_rest(word, length, 2, 1, "r", TOKEN_VAR);
                case 'e': // vec
                    return scanner_rest(word, length, 2, 1, "c", TOKEN_VEC);
                case 'o': // void
                    return scanner_rest(word, length, 2, 2, "id", TOKEN_VOID);
            }
            // Without this break control fell through into the 'u' case.
            break;
        }
        case 'u': {
            switch (word[1]) {
                case 'i':
                    return scanner_rest(word, length, 2, 2, "nt", TOKEN_UINT);
                case '8':
                    return scanner_rest(word, length, 2, 0, "", TOKEN_U8);
                case '1':
                    return scanner_rest(word, length, 2, 1, "6", TOKEN_U16);
                case '3':
                    return scanner_rest(word, length, 2, 1, "2", TOKEN_U32);
                case '6':
                    return scanner_rest(word, length, 2, 1, "4", TOKEN_U64);
            }
            break;
        }
        case 'm': {
            // map, match
            switch (word[1]) {
                case 'a': {
                    switch (word[2]) {
                        case 'p':
                            return scanner_rest(word, length, 3, 0, "", TOKEN_MAP);
                        case 't':
                            return scanner_rest(word, length, 3, 2, "ch", TOKEN_MATCH);
                    }
                    break;
                }
            }
            // Without this break control fell through into the 'r' case.
            break;
        }
        case 'r':
            return scanner_rest(word, length, 1, 5, "eturn", TOKEN_RETURN);
    }
    return TOKEN_IDENT;
}
// Starts scanning one token: clears the token length, rewinds the guard to
// the token start (`current`), and, when the first character is alphabetic,
// reads an identifier-shaped word with ident_advance().
//
// NOTE(review): this function is unfinished as written and does not compile:
//  * the return statement has no terminating ';';
//  * `ident` is not declared anywhere in this function, and the scanned
//    `word` is never used;
//  * no value is returned when the first character is not alphabetic.
inline token_t item() {
module.s_cursor.length = 0; // reset the token length
module.s_cursor.guard = module.s_cursor.current; // rewind the guard to the token start
if (is_alpha(module.s_cursor.source[module.s_cursor.guard])) {
std::string word = ident_advance();
return token_t(ident)
}
}
};