705 lines
24 KiB
C++
705 lines
24 KiB
C++
#pragma once
|
|
#include "doctest.h"
|
|
#include "token.h"
|
|
#include "../types.hpp"
|
|
#include <cctype>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <string>
|
|
|
|
class Scanner {
|
|
public:
|
|
// Binds the scanner to the module it will tokenize. Only the reference is
// stored, so `module` must outlive the scanner.
Scanner(module_t& module) : module(module) {}
|
|
|
|
inline std::vector<token_t> scan() {
|
|
std::vector<token_t> tokens;
|
|
|
|
while (!at_eof()) {
|
|
|
|
if (skip_space()) {
|
|
// 如果是空格或换行,则跳过
|
|
continue;
|
|
}
|
|
|
|
token_t token = item();
|
|
tokens.push_back(token);
|
|
}
|
|
|
|
tokens.push_back(token_t(TOKEN_EOF, "EOF", module.s_cursor.line, module.s_cursor.column));
|
|
|
|
// 词法分析后实例化语法分析相关变量
|
|
module.parser_cursor.tokens = tokens;
|
|
module.parser_cursor.current = 0;
|
|
return tokens;
|
|
}
|
|
|
|
|
|
private:
|
|
module_t& module;
|
|
|
|
inline std::string gen_word() {
|
|
return module.s_cursor.source.substr(module.s_cursor.current, module.s_cursor.length);
|
|
}
|
|
|
|
// True for the four blank characters the scanner recognizes:
// space, tab, carriage return and line feed.
inline bool is_space(char c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            return true;
        default:
            return false;
    }
}
|
|
|
|
|
|
// True when `s` is the double quote that opens a string literal.
inline bool is_string(char s) {
    const char quote = '"';
    return quote == s;
}
|
|
|
|
inline bool is_float(std::string word) {
|
|
// 是否包含 .,包含则为 float
|
|
int dot_count = 0;
|
|
bool has_e = false;
|
|
|
|
for (std::string::size_type i = 0; i < word.size(); i++) {
|
|
if (word[i] == '.')
|
|
dot_count++;
|
|
else if (word[i] == 'e' || word[i] == 'E')
|
|
has_e = true;
|
|
}
|
|
|
|
// 结尾不能是 .
|
|
if (word[-1] == '.') {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". floating-point numbers cannot end with '.'";
|
|
return false;
|
|
}
|
|
|
|
// 如果有科学计数法标记,则认为是浮点数
|
|
if (has_e) {
|
|
return true;
|
|
}
|
|
|
|
if (dot_count == 0) {
|
|
return false;
|
|
}
|
|
|
|
if (dot_count > 1) {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". floating-point numbers have multiple '.'";
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// True when `c` is an alphabetic character according to std::isalpha.
//
// The value is converted to unsigned char first: passing a negative char
// (any byte >= 0x80 where char is signed, e.g. UTF-8 text) straight to
// std::isalpha is undefined behavior.
inline bool is_alpha(char c) {
    return std::isalpha(static_cast<unsigned char>(c)) != 0;
}
|
|
|
|
// True when `c` is a decimal digit according to std::isdigit.
//
// The value is converted to unsigned char first: passing a negative char
// (any byte >= 0x80 where char is signed) straight to std::isdigit is
// undefined behavior.
inline bool is_number(char c) {
    return std::isdigit(static_cast<unsigned char>(c)) != 0;
}
|
|
|
|
// True when `c` is a hexadecimal digit: 0-9, a-f or A-F.
inline bool is_hex_number(char c) {
    const bool decimal_digit = '0' <= c && c <= '9';
    const bool lower_letter = 'a' <= c && c <= 'f';
    const bool upper_letter = 'A' <= c && c <= 'F';
    return decimal_digit || lower_letter || upper_letter;
}
|
|
|
|
// True when `c` is an octal digit (0-7).
inline bool is_oct_number(char c) {
    if (c < '0') {
        return false;
    }
    return c <= '7';
}
|
|
|
|
// True when `c` is a binary digit ('0' or '1').
inline bool is_bin_number(char c) {
    switch (c) {
        case '0':
        case '1':
            return true;
        default:
            return false;
    }
}
|
|
|
|
inline bool at_eof() {
|
|
return module.s_cursor.source[module.s_cursor.guard] == '\0';
|
|
}
|
|
|
|
// Moves the scan cursor forward by one character and returns the character it
// lands on.
//
// `guard`, `length` and `column` are each incremented. When the character
// landed on is a line feed, `line` is incremented and `column` is reset to 0.
inline char guard_advance() {
    auto& cursor = module.s_cursor;

    ++cursor.guard;
    ++cursor.length;
    ++cursor.column;

    const char landed = cursor.source[cursor.guard];
    if (landed == '\n') {
        ++cursor.line;
        cursor.column = 0;
    }

    return landed;
}
|
|
|
|
inline bool match(char expected) {
|
|
if (at_eof())
|
|
return false;
|
|
|
|
if (module.source[module.s_cursor.guard] != expected)
|
|
return false;
|
|
|
|
guard_advance();
|
|
return true;
|
|
}
|
|
|
|
inline std::string ident_advance() {
|
|
while((is_alpha(module.s_cursor.source[module.s_cursor.guard]) ||
|
|
is_number(module.s_cursor.source[module.s_cursor.guard])) &&
|
|
!at_eof()) {
|
|
guard_advance();
|
|
|
|
}
|
|
|
|
return gen_word();
|
|
}
|
|
|
|
// Scans one operator / punctuation token starting at the cursor and returns
// its type. The first character is always consumed; following characters are
// consumed only when they complete a longer operator (e.g. "<<=", "->").
//
// Returns TOKEN_NOT_IN_THIS_TYPE for any character that is not a known
// operator, and also (without moving the cursor) when called at end of input.
inline token_type_t special_char() {
    // Nothing to read at end of input; advancing here would step past the
    // end of the source text.
    if (at_eof()) {
        return token_type_t::TOKEN_NOT_IN_THIS_TYPE;
    }

    const char c = module.s_cursor.source[module.s_cursor.guard];
    guard_advance();

    switch (c) {
        case '(':
            return TOKEN_LEFT_PAREN;
        case ')':
            return TOKEN_RIGHT_PAREN;
        case '[':
            return TOKEN_LEFT_SQUARE;
        case ']':
            return TOKEN_RIGHT_SQUARE;
        case '{':
            return TOKEN_LEFT_CURLY;
        case '}':
            return TOKEN_RIGHT_CURLY;
        case ':':
            return TOKEN_COLON;
        case ';':
            return TOKEN_STMT_EOF;
        case ',':
            return TOKEN_COMMA;
        case '?':
            return TOKEN_QUESTION;
        case '%':
            return match('=') ? TOKEN_PERCENT_EQUAL : TOKEN_PERCENT;
        case '-':
            if (match('=')) {
                return TOKEN_MINUS_EQUAL;
            }
            if (match('>')) {
                return TOKEN_RIGHT_ARROW;
            }
            return TOKEN_MINUS;
        case '+':
            return match('=') ? TOKEN_PLUS_EQUAL : TOKEN_PLUS;
        case '/':
            return match('=') ? TOKEN_SLASH_EQUAL : TOKEN_SLASH;
        case '*':
            return match('=') ? TOKEN_STAR_EQUAL : TOKEN_STAR;
        case '.':
            return TOKEN_DOT;
        case '!':
            return match('=') ? TOKEN_NOT_EQUAL : TOKEN_NOT;
        case '=':
            return match('=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL;
        case '<':
            if (match('<')) {
                // "<<=" or "<<"
                return match('=') ? TOKEN_LEFT_SHIFT_EQUAL : TOKEN_LEFT_SHIFT;
            }
            if (match('=')) {
                return TOKEN_LESS_EQUAL;
            }
            return TOKEN_LEFT_ANGLE;
        case '>': {
            if (match('=')) {
                // ">="
                return TOKEN_GREATER_EQUAL;
            }

            // ">>=": look ahead before consuming anything. A plain ">>" must
            // stay two separate '>' tokens, so the second '>' may only be
            // consumed when '=' really follows it.
            if (!at_eof() &&
                module.s_cursor.source[module.s_cursor.guard] == '>' &&
                module.s_cursor.source[module.s_cursor.guard + 1] == '=') {
                guard_advance();  // second '>'
                guard_advance();  // '='
                return TOKEN_RIGHT_SHIFT_EQUAL;
            }

            return TOKEN_RIGHT_ANGLE;  // ">"
        }
        case '&':
            return match('&') ? TOKEN_AND_AND : TOKEN_AND;
        case '|':
            return match('|') ? TOKEN_OR_OR : TOKEN_OR;
        case '~':
            return TOKEN_TILDE;
        case '^':
            return match('=') ? TOKEN_XOR_EQUAL : TOKEN_XOR;
        default:
            return token_type_t::TOKEN_NOT_IN_THIS_TYPE;
    }
}
|
|
|
|
inline std::string string_advance() {
|
|
module.s_cursor.guard++;
|
|
char escape_char = '\\';
|
|
|
|
std::stringstream buf;
|
|
|
|
while (module.s_cursor.source[module.s_cursor.guard] != '\"' && !at_eof()) {
|
|
char guard = module.s_cursor.source[module.s_cursor.guard];
|
|
|
|
if (guard == '\n') {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". string cannot newline.";
|
|
}
|
|
|
|
// 处理转义字符
|
|
if (guard == escape_char) {
|
|
// 跳过转义字符第一个
|
|
module.s_cursor.guard++;
|
|
|
|
guard = module.s_cursor.source[module.s_cursor.guard];
|
|
|
|
switch (guard) {
|
|
case 'n':
|
|
guard = '\n';
|
|
break;
|
|
case 't':
|
|
guard = '\t';
|
|
break;
|
|
case 'r':
|
|
guard = '\r';
|
|
break;
|
|
case 'b':
|
|
guard = '\b';
|
|
break;
|
|
case 'f':
|
|
guard = '\f';
|
|
break;
|
|
case 'a':
|
|
guard = '\a';
|
|
break;
|
|
case 'v':
|
|
guard = '\v';
|
|
break;
|
|
case '0':
|
|
guard = '\0';
|
|
break;
|
|
case '\\':
|
|
case '\'':
|
|
case '\"':
|
|
break;
|
|
default:
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". unknown escape char " << guard;
|
|
}
|
|
}
|
|
|
|
buf << guard;
|
|
guard_advance();
|
|
}
|
|
|
|
//跳过close char
|
|
module.s_cursor.guard++;
|
|
|
|
return buf.str();
|
|
}
|
|
|
|
inline long number_convert(std::string word, int base) {
|
|
try {
|
|
long decimal = std::stol(word, 0, base);
|
|
return decimal;
|
|
} catch (const std::invalid_argument& e) {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". Invalid number: " << word << std::endl;
|
|
return 0;
|
|
} catch (const std::out_of_range& e) {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". Number out of range: " << word << std::endl;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
inline double number_convert_float(std::string word) {
|
|
try {
|
|
double decimal = std::stod(word);
|
|
return decimal;
|
|
} catch (const std::invalid_argument& e) {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". Invalid number: " << word << std::endl;
|
|
return 0;
|
|
} catch (const std::out_of_range& e) {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". Number out of range: " << word << std::endl;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
inline std::string hex_number_advance() {
|
|
module.s_cursor.guard += 2; // 跳过 0x
|
|
|
|
while (is_hex_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
|
|
guard_advance();
|
|
}
|
|
|
|
return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
|
|
}
|
|
|
|
inline std::string oct_number_advance() {
|
|
module.s_cursor.guard += 2; // 跳过 0o
|
|
|
|
while (is_oct_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
|
|
guard_advance();
|
|
}
|
|
|
|
return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
|
|
}
|
|
|
|
inline std::string bin_number_advance() {
|
|
module.s_cursor.guard += 2; // 跳过 0b
|
|
|
|
while (is_bin_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
|
|
guard_advance();
|
|
}
|
|
|
|
return module.s_cursor.source.substr(module.s_cursor.current+2, module.s_cursor.length);
|
|
}
|
|
|
|
inline std::string number_advance() {
|
|
while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
|
|
guard_advance();
|
|
}
|
|
|
|
// 处理小数部分
|
|
if (module.s_cursor.source[module.s_cursor.guard] == '.' && is_number(module.s_cursor.source[module.s_cursor.guard + 1])) {
|
|
guard_advance(); // 跳过小数点
|
|
while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
|
|
guard_advance();
|
|
}
|
|
}
|
|
|
|
// 处理科学计数法
|
|
if ((module.s_cursor.source[module.s_cursor.guard] == 'e' || module.s_cursor.source[module.s_cursor.guard] == 'E')
|
|
&& (is_number(module.s_cursor.source[module.s_cursor.guard + 1]) ||
|
|
module.s_cursor.source[module.s_cursor.guard + 1] == '+' ||
|
|
module.s_cursor.source[module.s_cursor.guard + 1] == '-')) {
|
|
guard_advance(); // 跳过 e 或 E
|
|
if (module.s_cursor.source[module.s_cursor.guard] == '+' || module.s_cursor.source[module.s_cursor.guard] == '-') {
|
|
guard_advance(); // 跳过符号
|
|
}
|
|
while(is_number(module.s_cursor.source[module.s_cursor.guard]) && !at_eof()) {
|
|
guard_advance();
|
|
}
|
|
}
|
|
|
|
return gen_word();
|
|
}
|
|
|
|
// Classifies a scanned word as a keyword token or a plain identifier.
//
// A word is a keyword only when it equals one of the entries below exactly;
// everything else is TOKEN_IDENT. The previous nested-switch version relied
// on `case` labels that were missing `break`, so control fell through into
// unrelated keyword checks and misclassified identifiers (for example "en"
// was reported as `fn`, "v8" as `u8`, "typ" as `tup`, "meturn" as `return`).
//
// word:   the identifier text.
// length: kept for interface compatibility; the only caller visible in this
//         file passes word.size(), and matching is done on `word` itself.
// Returns the keyword's token type, or TOKEN_IDENT.
static token_type_t scanner_ident(std::string word, int length) {
    struct keyword_entry {
        const char* text;
        token_type_t type;
    };

    static const keyword_entry keywords[] = {
        {"array", TOKEN_ARR},
        {"bool", TOKEN_BOOL},
        {"break", TOKEN_BREAK},
        {"continue", TOKEN_CONTINUE},
        {"else", TOKEN_ELSE},
        {"fn", TOKEN_FN},
        {"false", TOKEN_FALSE},
        {"f32", TOKEN_F32},
        {"f64", TOKEN_F64},
        {"for", TOKEN_FOR},
        {"if", TOKEN_IF},
        {"i8", TOKEN_I8},
        {"i16", TOKEN_I16},
        {"i32", TOKEN_I32},
        {"i64", TOKEN_I64},
        {"string", TOKEN_STRING},
        {"struct", TOKEN_STRUCT},
        {"type", TOKEN_TYPE},
        {"tup", TOKEN_TUP},
        {"true", TOKEN_TRUE},
        {"var", TOKEN_VAR},
        {"vec", TOKEN_VEC},
        {"void", TOKEN_VOID},
        {"u8", TOKEN_U8},
        {"u16", TOKEN_U16},
        {"u32", TOKEN_U32},
        {"u64", TOKEN_U64},
        {"map", TOKEN_MAP},
        {"return", TOKEN_RETURN},
    };

    (void)length;  // see the note on `length` above

    // The table is small, so a linear scan is sufficient.
    for (const keyword_entry& entry : keywords) {
        if (word == entry.text) {
            return entry.type;
        }
    }

    return TOKEN_IDENT;
}
|
|
|
|
inline bool multi_comment_end() {
|
|
return module.s_cursor.source[module.s_cursor.guard] == '*' &&
|
|
module.s_cursor.source[module.s_cursor.guard + 1] == '/';
|
|
}
|
|
|
|
inline bool skip_space() {
|
|
bool has_new = false;
|
|
|
|
if (module.s_cursor.guard != module.s_cursor.current) {
|
|
module.s_cursor.space_prev = module.s_cursor.source[module.s_cursor.guard - 1];
|
|
}
|
|
|
|
while (true) {
|
|
char c = module.s_cursor.source[module.s_cursor.guard];
|
|
switch (c) {
|
|
case ' ':
|
|
case '\r':
|
|
case '\t': {
|
|
guard_advance();
|
|
break;
|
|
}
|
|
case '\n': {
|
|
guard_advance();
|
|
has_new = true;
|
|
break;
|
|
}
|
|
case '/': {
|
|
if (module.s_cursor.source[module.s_cursor.guard + 1] == '/') {
|
|
// 单行注释
|
|
while (module.s_cursor.source[module.s_cursor.guard] != '\n' && !at_eof()) {
|
|
guard_advance();
|
|
}
|
|
break;
|
|
} else if (module.s_cursor.source[module.s_cursor.guard + 1] == '*') {
|
|
while (!multi_comment_end()) {
|
|
if (at_eof()) {
|
|
std::cerr << "[ERROR] line: " << module.s_cursor.line << ". column: "
|
|
<< module.s_cursor.column <<". multi comment not end.";
|
|
return false;
|
|
}
|
|
guard_advance();
|
|
}
|
|
|
|
guard_advance(); // 跳过 *
|
|
guard_advance(); // 跳过 /
|
|
break;
|
|
} else {
|
|
module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard];
|
|
return has_new;
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
module.s_cursor.space_next = module.s_cursor.source[module.s_cursor.guard];
|
|
return has_new;
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// Scans exactly one token starting at the cursor and returns it.
//
// Dispatch is on the first character:
//   - letter: identifier or keyword (see scanner_ident);
//   - digit:  numeric literal. "0x" / "0o" / "0b" prefixed literals are
//             always integers and carry only their digits as text; other
//             literals are classified by is_float();
//   - '"':    string literal;
//   - else:   operator / punctuation (see special_char).
//
// The token's line and column are the cursor position after the token has
// been consumed.
inline token_t item() {
    module.s_cursor.length = 0;                       // reset the token length
    module.s_cursor.current = module.s_cursor.guard;  // token starts at the cursor

    const char first = module.s_cursor.source[module.s_cursor.guard];

    if (is_alpha(first)) {
        std::string word = ident_advance();
        return token_t(scanner_ident(word, word.size()), word, module.s_cursor.line, module.s_cursor.column);
    }

    if (is_number(first)) {
        std::string word;
        token_type_t type = TOKEN_LITERAL_INT;

        // A leading '0' may introduce a hexadecimal, octal or binary literal.
        const char prefix = (first == '0') ? module.s_cursor.source[module.s_cursor.guard + 1] : '\0';

        // The number_convert* calls below are used for validation only: they
        // print an error for malformed or out-of-range text.
        if (prefix == 'x' || prefix == 'X') {
            word = hex_number_advance();
            number_convert(word, 16);
        } else if (prefix == 'o' || prefix == 'O') {
            word = oct_number_advance();
            number_convert(word, 8);
        } else if (prefix == 'b' || prefix == 'B') {
            word = bin_number_advance();
            number_convert(word, 2);
        } else {
            word = number_advance();

            // Only plain decimal literals may be floats. Prefixed literals
            // must not go through is_float(): hex digits such as "1E"
            // contain 'e' / 'E' and would be mistaken for an exponent.
            if (is_float(word)) {
                type = TOKEN_LITERAL_FLOAT;
                // Validate as floating point and keep the text intact;
                // integer parsing of e.g. "0.5" with its leading zero
                // removed (".5") reports a bogus "Invalid number" error.
                number_convert_float(word);
            } else {
                // Integer with a leading zero: drop that zero.
                if (first == '0' && word.size() > 1) {
                    word = word.substr(1);
                }
                number_convert(word, 10);
            }
        }

        return token_t(type, word, module.s_cursor.line, module.s_cursor.column);
    }

    if (is_string(first)) {
        std::string word = string_advance();
        return token_t(TOKEN_LITERAL_STRING, word, module.s_cursor.line, module.s_cursor.column);
    }

    token_type_t type = special_char();

    return token_t(type, gen_word(), module.s_cursor.line, module.s_cursor.column);
}
|
|
|
|
|
|
};
|
|
|