// ASL lexer (src/lexer.rs + src/main.rs of the original patch, reconstructed).
// The extraction stripped generic parameters (`Vec` for `Vec<Token>` etc.);
// they are restored here, together with fixes for dropped operator/separator
// tokens, comment-skipping off-by-one, and two identifier-splitting bugs.

use std::fs;

/// Token categories emitted by the lexer.
///
/// NOTE(review): variant names keep the original SCREAMING case so existing
/// pattern matches stay valid, although Rust convention is UpperCamelCase.
#[derive(Debug, PartialEq)]
pub enum TokenType {
    STRING,
    NUMBER,
    IDENTIFIER,
    OPERATOR,
    KEYWORD,
    SEPARATOR,
}

/// One lexed token: its category, raw text, and the index (into the
/// `input.split("")` character list) where the token started.
#[derive(Debug)]
pub struct Token {
    pub typ: TokenType,
    pub value: String,
    pub pos: usize,
}

/// True for characters the lexer skips outright: whitespace plus the empty
/// fragments that `str::split("")` yields at both ends of the input.
fn is_ignored(char: &str) -> bool {
    ["", " ", "\n", "\r", "\t"].contains(&char)
}

/// True for an ASCII decimal digit.
fn is_number(char: &str) -> bool {
    ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"].contains(&char)
}

/// True for a single-character operator.
/// NOTE(review): `=` is not listed, so it currently lexes into identifiers —
/// confirm against the planned parser before adding it.
fn is_operator(char: &str) -> bool {
    ["+", "-", "*", "/", "^", "%", "|", "&", "!"].contains(&char)
}

/// True for a separator (brackets, comma, dot).
fn is_sep(char: &str) -> bool {
    ["(", ")", "[", "]", "{", "}", ",", "."].contains(&char)
}

/// Reads a string literal of the form `szaft"..."szaft`.
///
/// On entry `pos` is just past the opening `s`; the remaining `zaft"`
/// (5 characters) is skipped here. Panics if the input ends before the
/// closing `"szaft` marker is found.
fn read_string(splitted: &Vec<&str>, pos: &mut usize, out: &mut Vec<Token>) {
    let mut str = String::from("");
    let start_pos = *pos - 1; // index of the opening `s`
    *pos += 5; // skip the rest of the opening marker: `zaft"`
    let mut success = false;
    while *pos < splitted.len() {
        let nchar = splitted[*pos];
        *pos += 1;
        // Closing-marker check: the six characters starting at `nchar`.
        if splitted.len() >= *pos + 5 && splitted[(*pos - 1)..(*pos + 5)].join("") == "\"szaft" {
            success = true;
            break;
        }
        str += nchar;
    }
    if !success {
        panic!("Unexpected end of string at {}", pos);
    }
    *pos += 5; // skip the trailing `szaft` (the `"` was already consumed)
    out.push(Token { typ: TokenType::STRING, value: str, pos: start_pos });
}

/// Skips a comment. `//` comments run to the end of the line; `/* */`
/// comments run to the closing `*/` and panic if it is missing.
///
/// Fixes vs. the original: `*/` is only recognised inside multi-line
/// comments, and the extra position bump after the terminator is only
/// applied for multi-line comments (it used to swallow the first
/// character after a `//` comment's newline).
fn read_comment(splitted: &Vec<&str>, pos: &mut usize, is_multiline: bool) {
    *pos += 1; // skip the second character of the opener (`/` or `*`)
    let mut success = !is_multiline; // a `//` comment cannot be unterminated
    while *pos < splitted.len() {
        let nchar = splitted[*pos];
        *pos += 1;
        if (nchar == "\n" || nchar == "\r") && !is_multiline {
            break;
        }
        if is_multiline
            && splitted.len() >= *pos + 1
            && splitted[(*pos - 1)..(*pos + 1)].join("") == "*/"
        {
            success = true;
            break;
        }
    }
    if !success {
        panic!("Unexpected end of comment at {}", pos);
    }
    if is_multiline {
        *pos += 1; // skip the `/` of the closing `*/`
    }
}

/// For `["ha", "nem", "geny"]` returns the cumulative space-joined prefixes
/// `["ha", "ha nem", "ha nem geny"]`; used to recognise multi-word keywords
/// incrementally, word by word.
fn generate_combinations(words: Vec<&str>) -> Vec<String> {
    let mut result = vec![];
    let mut current = String::new();
    for word in words {
        if !current.is_empty() {
            current.push(' ');
        }
        current.push_str(word);
        result.push(current.clone());
    }
    result
}

/// Reads a run of identifier/keyword text starting at the character the
/// caller just consumed, then splits it into KEYWORD and IDENTIFIER tokens.
/// Multi-word keywords (e.g. `ha nem geny`) are recognised by greedily
/// extending a previous KEYWORD token while the combination is still a
/// valid keyword prefix; prefixes that never complete a keyword are
/// demoted back to identifiers, and adjacent identifiers are merged.
///
/// Fixes vs. the original:
/// * the terminating operator/separator/newline is pushed back so the main
///   loop can tokenize it instead of silently dropping it;
/// * no empty IDENTIFIER token is emitted when the run ends in a keyword;
/// * `word_pos` lookups are bounds-checked (a trailing space before the
///   terminator used to cause an out-of-bounds panic).
fn read_identifier(splitted: &Vec<&str>, pos: &mut usize, out: &mut Vec<Token>) {
    let keywords = vec!["kraf","piszolj","ha nem geny akkor geny","ha nem geny","nem piszv","kopva","gethelj","ha geny","jukadban","lőcsve","nem reti","csecs","megint","reti","piszv","amíg geny"];
    // Every individual word and every space-joined prefix of a keyword,
    // consulted while scanning word by word below.
    let mut raw_keywords: Vec<String> = vec![];
    for keyword in &keywords {
        let spi: Vec<&str> = keyword.split(" ").collect();
        for word in &spi {
            if !raw_keywords.contains(&String::from(*word)) {
                raw_keywords.push(String::from(*word));
            }
        }
        let combos = generate_combinations(spi);
        for word in combos {
            if !raw_keywords.contains(&word) {
                raw_keywords.push(word);
            }
        }
    }
    // Collect everything up to the next operator/separator/newline.
    let mut identifier = String::from("");
    let start_pos = *pos - 1;
    let mut word_pos: Vec<usize> = vec![start_pos]; // start index of each word
    *pos -= 1; // re-read the character the caller already consumed
    while *pos < splitted.len() {
        let prev_char = splitted[*pos - 1];
        let char = splitted[*pos];
        *pos += 1;
        if is_operator(char) || is_sep(char) || char == "\n" || char == "\r" {
            *pos -= 1; // push the terminator back for the main loop
            break;
        }
        if prev_char == " " {
            word_pos.push(*pos - 1);
        }
        identifier += char;
    }
    // Break the collected text up into tokens.
    let words: Vec<&str> = identifier.split(" ").collect();
    let mut state: Vec<&str> = vec![]; // pending non-keyword words
    let mut state_pos: usize = 0; // start index of the pending identifier
    let mut i = 0;
    let mut pre_out: Vec<Token> = vec![];
    for word in words {
        // Bounds-checked word position (a trailing empty word has no entry).
        let wpos = word_pos.get(i).copied().unwrap_or(start_pos);
        if raw_keywords.contains(&String::from(word)) {
            // Flush any pending identifier before the keyword.
            if state_pos > 0 {
                pre_out.push(Token { typ: TokenType::IDENTIFIER, value: state.join(" "), pos: state_pos });
            }
            if let Some(prev_token) = pre_out.last_mut() {
                if prev_token.typ == TokenType::KEYWORD {
                    // Try to extend the previous keyword into a longer one.
                    let next_value = format!("{} {}", prev_token.value, word);
                    if raw_keywords.contains(&next_value) {
                        prev_token.value = next_value;
                    } else {
                        pre_out.push(Token { typ: TokenType::KEYWORD, value: String::from(word), pos: wpos });
                    }
                } else {
                    pre_out.push(Token { typ: TokenType::KEYWORD, value: String::from(word), pos: wpos });
                }
            } else {
                pre_out.push(Token { typ: TokenType::KEYWORD, value: String::from(word), pos: wpos });
            }
            state = vec![];
            state_pos = 0;
        } else {
            state.push(word);
            if state_pos == 0 {
                state_pos = wpos;
            }
        }
        i += 1;
    }
    // Flush a trailing identifier — only if there actually is one (the
    // original emitted an empty IDENTIFIER when the run ended in a keyword).
    let trailing = state.join(" ");
    if !trailing.is_empty() {
        pre_out.push(Token { typ: TokenType::IDENTIFIER, value: trailing, pos: state_pos });
    }
    // Keyword words that never completed a full keyword are identifiers.
    for token in &mut pre_out {
        if token.typ == TokenType::KEYWORD {
            if !keywords.contains(&token.value.as_str()) {
                token.typ = TokenType::IDENTIFIER;
            }
        }
    }
    // Merge adjacent IDENTIFIER tokens into one space-joined identifier.
    let mut i = 1;
    while i < pre_out.len() {
        let (left, right) = pre_out.split_at_mut(i);
        let token = &right[0];
        let prev_token = &mut left[i - 1];
        if token.typ == TokenType::IDENTIFIER && prev_token.typ == TokenType::IDENTIFIER {
            prev_token.value = format!("{} {}", prev_token.value, token.value);
            pre_out.remove(i);
            i = 1
        }
        i += 1;
    }
    out.extend(pre_out);
}

/// Splits `input` into characters and produces the token stream.
///
/// Numbers, operators and separators are handled inline; `szaft"..."szaft`
/// string literals, `//` / `/* */` comments, and identifier/keyword runs
/// are delegated to the helpers above. Panics on an unterminated string
/// or `/* */` comment.
pub fn lex(input: String) -> Vec<Token> {
    let mut out: Vec<Token> = vec![];
    // split("") yields one &str per character plus "" at both ends;
    // start at 1 to skip the leading "".
    let splitted: Vec<&str> = input.split("").collect();
    let mut pos = 1;
    while pos < splitted.len() {
        let char = splitted[pos];
        pos += 1;
        if is_ignored(char) {
            continue;
        }
        if is_number(char) {
            // Integer literal: greedily consume digits.
            let mut num = String::from(char);
            let start_pos = pos - 1;
            while pos < splitted.len() {
                let nchar = splitted[pos];
                pos += 1;
                if !is_number(nchar) {
                    break;
                }
                num += nchar;
            }
            pos -= 1; // re-examine the first non-digit character
            out.push(Token { typ: TokenType::NUMBER, value: num, pos: start_pos });
        } else if splitted.len() >= pos + 5 && splitted[(pos - 1)..(pos + 5)].join("") == "szaft\"" {
            read_string(&splitted, &mut pos, &mut out);
        } else if splitted.len() >= pos + 1 && splitted[(pos - 1)..(pos + 1)].join("") == "//" {
            read_comment(&splitted, &mut pos, false);
        } else if splitted.len() >= pos + 1 && splitted[(pos - 1)..(pos + 1)].join("") == "/*" {
            read_comment(&splitted, &mut pos, true);
        } else if is_operator(char) {
            out.push(Token { typ: TokenType::OPERATOR, value: String::from(char), pos: pos - 1 });
        } else if is_sep(char) {
            // Fix: separators were previously swallowed by read_identifier
            // and the SEPARATOR variant was never produced.
            out.push(Token { typ: TokenType::SEPARATOR, value: String::from(char), pos: pos - 1 });
        } else {
            read_identifier(&splitted, &mut pos, &mut out);
        }
    }
    out
}

/// Entry point (src/main.rs in the patch, where it calls `lexer::lex`):
/// reads `./test.as` and prints the token stream.
fn main() {
    let inp = fs::read_to_string("./test.as");
    match inp {
        Result::Ok(data) => {
            let lexed = lex(data);
            println!("Lexer output: ");
            for token in lexed {
                println!(" {}: {:?}: {}", token.pos, token.typ, token.value);
            }
        }
        Result::Err(err) => {
            panic!("Error while reading file: {}", err)
        }
    }
}