topazc
lexer.cpp
Go to the documentation of this file.
1
6
9
10std::vector<Token> Lexer::tokenize() {
11 std::vector<Token> tokens;
12
13 while (pos < source_len) {
14 const char c = peek();
15 if (c == ' ' || c == '\n') {
16 advance();
17 }
18 else if (c == '/') {
19 if (peek(1) == '/') {
21 }
22 else {
23 tokens.push_back(tokenize_op());
24 }
25 }
26 else if (isalpha(c) || c == '_') {
27 tokens.push_back(tokenize_id());
28 }
29 else if (isdigit(c)) {
30 tokens.push_back(tokenize_number_lit());
31 }
32 else if (c == '\"') {
33 tokens.push_back(tokenize_string_lit());
34 }
35 else if (c == '\'') {
36 tokens.push_back(tokenize_character_lit());
37 }
38 else {
39 tokens.push_back(tokenize_op());
40 }
41 }
42
43 return tokens;
44}
45
47 std::string value;
48 uint32_t tmp_l = line;
49 uint32_t tmp_c = column;
50
51 while (pos < source_len && (isalpha(peek()) || isdigit(peek()) || peek() == '_')) {
52 value += advance();
53 }
54
55 if (keywords.find(value) != keywords.end()) {
56 return Token(keywords[value], value, tmp_l, tmp_c, file_name);
57 }
58 else if (value == "true" || value == "false") {
59 return Token(TOK_BOOLEAN_LIT, value, tmp_l, tmp_c, file_name);
60 }
61 return Token(TOK_ID, value, tmp_l, tmp_c, file_name);
62}
63
65 std::string value;
66 uint32_t tmp_l = line;
67 uint32_t tmp_c = column;
68 bool has_dot = false;
69
70 while (pos < source_len && (isdigit(peek()) || peek() == '.' || peek() == '_')) {
71 if (peek() == '_') {
72 advance();
73 continue;
74 }
75 else if (peek() == '.') {
76 if (has_dot) {
77 throw_exception(SUB_LEXER, "Invalid number literal: twice dot", line, file_name, is_debug);
78 }
79 else if (pos < source_len && peek(1) == '_') {
80 throw_exception(SUB_LEXER, "Invalid number literal: \033[0m'_'\033[31m cannot be immediately after the dot", line, file_name, is_debug);
81 }
82 else if (pos < source_len && !isdigit(peek(1))) {
83 throw_exception(SUB_LEXER, "Invalid number literal: dot cannot be the end", line, file_name, is_debug);
84 }
85 has_dot = true;
86 }
87 value += advance();
88 }
89
90 const char suffix = pos < source_len ? peek() : '\0';
91 switch (tolower(suffix)) {
92 case 'f':
93 advance();
94 return Token(TOK_FLOAT_LIT, value, tmp_l, tmp_c, file_name);
95 case 's':
96 if (has_dot) {
97 throw_exception(SUB_LEXER, "Invalid number literal: specified suffix \033[0m's'\033[31m does not match for floating point literal", line, file_name, is_debug);
98 }
99 advance();
100 return Token(TOK_SHORT_LIT, value, tmp_l, tmp_c, file_name);
101 case 'l':
102 if (has_dot) {
103 throw_exception(SUB_LEXER, "Invalid number literal: specified suffix \033[0m'l'\033[31m does not match for floating point literal", line, file_name, is_debug);
104 }
105 advance();
106 return Token(TOK_LONG_LIT, value, tmp_l, tmp_c, file_name);
107 default:
108 if (has_dot) {
109 return Token(TOK_DOUBLE_LIT, value, tmp_l, tmp_c, file_name);
110 }
111 else {
112 return Token(TOK_INT_LIT, value, tmp_l, tmp_c, file_name);
113 }
114 }
115}
116
118 std::string value;
119 uint32_t tmp_l = line;
120 uint32_t tmp_c = column;
121
122 advance();
123 while (pos < source_len && peek() != '\"') {
124 char c = advance();
125 if (c == '\\') {
127 }
128 value += c;
129 }
130 if (pos == source_len) {
131 throw_exception(SUB_LEXER, "Invalid string literal: missed closing double quote", line, file_name, is_debug);
132 }
133 advance();
134
135 return Token(TOK_STRING_LIT, value, tmp_l, tmp_c, file_name);
136}
137
139 std::string value;
140 uint32_t tmp_l = line;
141 uint32_t tmp_c = column;
142
143 advance();
144 while (pos < source_len && peek() != '\'') {
145 char c = advance();
146 if (c == '\\') {
148 }
149 value += c;
150 }
151 if (pos == source_len) {
152 throw_exception(SUB_LEXER, "Invalid character literal: missed closing single quote", line, file_name, is_debug);
153 }
154 else if (value.length() != 1) {
155 throw_exception(SUB_LEXER, "Invalid character literal: length should be equal to 1", line, file_name, is_debug);
156 }
157 advance();
158
159 return Token(TOK_CHARACTER_LIT, value, tmp_l, tmp_c, file_name);
160}
161
163 uint32_t tmp_l = line;
164 uint32_t tmp_c = column;
165 const char c = advance();
166
167 switch (c) {
168 case '+':
169 if (peek() == '=') {
170 advance();
171 return Token(TOK_OP_PLUS_EQ, "+=", tmp_l, tmp_c, file_name);
172 }
173 else if (peek() == '+') {
174 advance();
175 return Token(TOK_OP_INC, "++", tmp_l, tmp_c, file_name);
176 }
177 return Token(TOK_OP_PLUS, "+", tmp_l, tmp_c, file_name);
178 case '-':
179 if (peek() == '=') {
180 advance();
181 return Token(TOK_OP_MINUS_EQ, "-=", tmp_l, tmp_c, file_name);
182 }
183 else if (peek() == '-') {
184 advance();
185 return Token(TOK_OP_DEC, "--", tmp_l, tmp_c, file_name);
186 }
187 else if (peek() == '>') {
188 advance();
189 return Token(TOK_OP_NEXT, "->", tmp_l, tmp_c, file_name);
190 }
191 return Token(TOK_OP_MINUS, "-", tmp_l, tmp_c, file_name);
192 case '*':
193 if (peek() == '=') {
194 advance();
195 return Token(TOK_OP_MULT_EQ, "*=", tmp_l, tmp_c, file_name);
196 }
197 return Token(TOK_OP_MULT, "*", tmp_l, tmp_c, file_name);
198 case '/':
199 if (peek() == '=') {
200 advance();
201 return Token(TOK_OP_DIV_EQ, "/=", tmp_l, tmp_c, file_name);
202 }
203 return Token(TOK_OP_DIV, "/", tmp_l, tmp_c, file_name);
204 case '%':
205 if (peek() == '=') {
206 advance();
207 return Token(TOK_OP_MODULO_EQ, "%=", tmp_l, tmp_c, file_name);
208 }
209 return Token(TOK_OP_MODULO, "%", tmp_l, tmp_c, file_name);
210 case '=':
211 if (peek() == '=') {
212 advance();
213 return Token(TOK_OP_EQ_EQ, "==", tmp_l, tmp_c, file_name);
214 }
215 return Token(TOK_OP_EQ, "=", tmp_l, tmp_c, file_name);
216 case '!':
217 if (peek() == '=') {
218 advance();
219 return Token(TOK_OP_NOT_EQ_EQ, "!=", tmp_l, tmp_c, file_name);
220 }
221 return Token(TOK_OP_L_NOT, "!", tmp_l, tmp_c, file_name);
222 case '>':
223 if (peek() == '=') {
224 advance();
225 return Token(TOK_OP_GT_EQ, ">=", tmp_l, tmp_c, file_name);
226 }
227 return Token(TOK_OP_GT, ">", tmp_l, tmp_c, file_name);
228 case '<':
229 if (peek() == '=') {
230 advance();
231 return Token(TOK_OP_LS_EQ, "<=", tmp_l, tmp_c, file_name);
232 }
233 return Token(TOK_OP_LS, "<", tmp_l, tmp_c, file_name);
234 case '&':
235 if (peek() == '&') {
236 advance();
237 return Token(TOK_OP_L_AND, "&&", tmp_l, tmp_c, file_name);
238 }
239 return Token(TOK_OP_REF, "&", tmp_l, tmp_c, file_name);
240 case '|':
241 if (peek() == '|') {
242 advance();
243 return Token(TOK_OP_L_OR, "||", tmp_l, tmp_c, file_name);
244 }
245 throw_exception(SUB_LEXER, "Operator '|' (aka bitwise or) is unsupported", line, file_name, is_debug);
246 case ',':
247 return Token(TOK_OP_COMMA, ",", tmp_l, tmp_c, file_name);
248 case '.':
249 return Token(TOK_OP_DOT, ".", tmp_l, tmp_c, file_name);
250 case ':':
251 return Token(TOK_OP_COLON, ":", tmp_l, tmp_c, file_name);
252 case ';':
253 return Token(TOK_OP_SEMICOLON, ";", tmp_l, tmp_c, file_name);
254 case '(':
255 return Token(TOK_OP_LPAREN, "(", tmp_l, tmp_c, file_name);
256 case ')':
257 return Token(TOK_OP_RPAREN, ")", tmp_l, tmp_c, file_name);
258 case '{':
259 return Token(TOK_OP_LBRACE, "{", tmp_l, tmp_c, file_name);
260 case '}':
261 return Token(TOK_OP_RBRACE, "}", tmp_l, tmp_c, file_name);
262 case '[':
263 return Token(TOK_OP_LBRACKET, "[", tmp_l, tmp_c, file_name);
264 case ']':
265 return Token(TOK_OP_RBRACKET, "]", tmp_l, tmp_c, file_name);
266 case '?':
267 return Token(TOK_OP_QUESTION, "?", tmp_l, tmp_c, file_name);
268 default:
269 std::stringstream ss;
270 ss << "Unsupported operator: \033[0m'" << c << "'";
272 }
273}
274
276 advance();
277 advance();
278 while (pos < source_len && peek() != '\n') {
279 advance();
280 }
281}
282
284 const char c = advance();
285 switch (c) {
286 case 'n':
287 return '\n';
288 case 't':
289 return '\t';
290 case 'v':
291 return '\v';
292 case 'b':
293 return '\b';
294 case 'r':
295 return '\r';
296 case 'f':
297 return '\f';
298 case 'a':
299 return '\a';
300 case '\\':
301 return '\\';
302 case '\'':
303 return '\'';
304 case '"':
305 return '\"';
306 case '?':
307 return '\?';
308 default:
309 std::stringstream ss;
310 ss << "Unsupported escape sequence: \033[0m'\\" << c;
312 }
313}
314
315const char Lexer::peek(int32_t rpos) const {
316 if (pos + rpos >= source_len || pos + rpos < 0) {
317 std::stringstream ss;
318 ss << "Index out of range: " << pos + rpos << '/' << source_len;
320 }
321 return source[pos + rpos];
322}
323
324const char Lexer::advance() {
325 const char c = peek();
326 pos++;
327 column++;
328 if (c == '\n') {
329 line++;
330 column = 1;
331 }
332 return c;
333}
std::string file_name
Definition lexer.hpp:17
Token tokenize_character_lit()
Method for tokenizing character literal.
Definition lexer.cpp:138
const char get_escape_sequence()
Method for getting escape-sequence in string or character literal.
Definition lexer.cpp:283
Token tokenize_string_lit()
Method for tokenizing string literal.
Definition lexer.cpp:117
Token tokenize_number_lit()
Method for tokenizing number literal.
Definition lexer.cpp:64
std::string source
Definition lexer.hpp:18
uint32_t line
Definition lexer.hpp:21
bool is_debug
Definition lexer.hpp:50
uint32_t pos
Definition lexer.hpp:20
const char advance()
Method for skipping current character from source code and returns it.
Definition lexer.cpp:324
void skip_comments()
Method for skipping comments.
Definition lexer.cpp:275
uint32_t column
Definition lexer.hpp:22
size_t source_len
Definition lexer.hpp:19
const char peek(int32_t rpos=0) const
Method for getting character from source code by lexer pos and passed offset.
Definition lexer.cpp:315
Token tokenize_op()
Method for tokenizing operator.
Definition lexer.cpp:162
Token tokenize_id()
Method for tokenizing identifier token.
Definition lexer.cpp:46
std::map< std::string, TokenType > keywords
Definition lexer.hpp:23
std::vector< Token > tokenize()
Method for tokenizing source code.
Definition lexer.cpp:10
void throw_exception(SubsystemType type, std::string msg, uint32_t line, std::string file_name, bool is_debug)
Function for throwing exception.
Definition exception.cpp:30
Header file for defining thrown exceptions by the compiler.
@ SUB_LEXER
Definition exception.hpp:15
Header file for defining the lexer.
Token structure.
Definition token.hpp:92
@ TOK_OP_DEC
Definition token.hpp:57
@ TOK_OP_RBRACKET
Definition token.hpp:84
@ TOK_OP_DIV_EQ
Definition token.hpp:61
@ TOK_OP_COMMA
Definition token.hpp:75
@ TOK_OP_DIV
Definition token.hpp:60
@ TOK_OP_NEXT
Definition token.hpp:86
@ TOK_OP_LBRACKET
Definition token.hpp:83
@ TOK_OP_EQ_EQ
Definition token.hpp:65
@ TOK_OP_LS
Definition token.hpp:69
@ TOK_OP_GT_EQ
Definition token.hpp:68
@ TOK_CHARACTER_LIT
Definition token.hpp:43
@ TOK_OP_MINUS_EQ
Definition token.hpp:56
@ TOK_OP_LBRACE
Definition token.hpp:81
@ TOK_STRING_LIT
Definition token.hpp:50
@ TOK_OP_RBRACE
Definition token.hpp:82
@ TOK_SHORT_LIT
Definition token.hpp:44
@ TOK_INT_LIT
Definition token.hpp:45
@ TOK_OP_LPAREN
Definition token.hpp:79
@ TOK_OP_NOT_EQ_EQ
Definition token.hpp:66
@ TOK_OP_PLUS_EQ
Definition token.hpp:53
@ TOK_ID
Definition token.hpp:42
@ TOK_OP_L_OR
Definition token.hpp:73
@ TOK_OP_SEMICOLON
Definition token.hpp:78
@ TOK_OP_MULT_EQ
Definition token.hpp:59
@ TOK_OP_RPAREN
Definition token.hpp:80
@ TOK_OP_INC
Definition token.hpp:54
@ TOK_OP_QUESTION
Definition token.hpp:85
@ TOK_OP_PLUS
Definition token.hpp:52
@ TOK_OP_COLON
Definition token.hpp:77
@ TOK_OP_L_NOT
Definition token.hpp:71
@ TOK_OP_DOT
Definition token.hpp:76
@ TOK_OP_EQ
Definition token.hpp:64
@ TOK_OP_MODULO_EQ
Definition token.hpp:63
@ TOK_FLOAT_LIT
Definition token.hpp:47
@ TOK_OP_L_AND
Definition token.hpp:72
@ TOK_DOUBLE_LIT
Definition token.hpp:48
@ TOK_OP_MODULO
Definition token.hpp:62
@ TOK_OP_MULT
Definition token.hpp:58
@ TOK_OP_REF
Definition token.hpp:74
@ TOK_LONG_LIT
Definition token.hpp:46
@ TOK_OP_GT
Definition token.hpp:67
@ TOK_OP_MINUS
Definition token.hpp:55
@ TOK_OP_LS_EQ
Definition token.hpp:70
@ TOK_BOOLEAN_LIT
Definition token.hpp:49