1- use crate :: compiler :: { ast :: { Pos , Span } , CResult , CompilerError } ;
1+ // Source text -> Tokens
22
3+ use std:: { fs:: File , io:: Read , path:: Path } ;
4+ use crate :: compiler:: { ast:: { Pos , Span } , CompilerError , ToCompileResult } ;
35
// NOTE(review): the use sites of this flag are not visible in this excerpt —
// presumably it decides whether keyword/identifier matching is case-sensitive;
// confirm against where it is read before relying on that.
46const CASE_SENSITIVITY : bool = true ;
57
8+ #[ derive( Debug , Clone ) ]
9+ pub struct LexError {
10+ pub span : Span ,
11+ pub message : String ,
12+ }
13+
14+ impl < T > ToCompileResult < T > for Result < T , Vec < LexError > > {
15+ fn into_cresult ( self ) -> Result < T , super :: CompilerError > {
16+ self . map_err ( |err| CompilerError :: LexError ( err) )
17+ }
18+ }
19+
20+
621#[ derive( Debug , Clone , PartialEq ) ]
722pub enum TokenKind {
823 // Identifiers and Literals
@@ -63,6 +78,9 @@ pub enum TokenKind {
6378 Dot , // .
6479
6580 // Misc
81+ Invalid ( String ) ,
82+ UnterminatedString ( String ) ,
83+
6684 EOF ,
6785}
6886
@@ -72,9 +90,19 @@ pub struct Token {
7290 pub span : Span ,
7391}
7492
93+ pub fn lex_file ( path : impl AsRef < Path > ) -> std:: io:: Result < Result < Vec < Token > , Vec < LexError > > > {
94+ let mut file = File :: open ( path) ?;
95+ let mut src = String :: new ( ) ;
96+ file. read_to_string ( & mut src) ?;
97+
98+ let mut lexer = Lexer :: new ( & src) ;
99+ Ok ( lexer. lex_all ( ) )
100+ }
101+
75102pub struct Lexer {
76103 src : Vec < char > ,
77104 pos : usize ,
105+ errors : Vec < LexError > ,
78106 line : usize ,
79107 col : usize ,
80108}
@@ -83,35 +111,40 @@ impl Lexer {
83111 Self {
84112 src : input. chars ( ) . collect ( ) ,
85113 pos : 0 ,
114+ errors : Vec :: new ( ) ,
86115 line : 1 ,
87116 col : 1 ,
88117 }
89118 }
90119
91- pub fn lex_all ( & mut self ) -> CResult < Vec < Token > > {
120+ pub fn lex_all ( & mut self ) -> Result < Vec < Token > , Vec < LexError > > {
92121 let mut tokens = Vec :: new ( ) ;
93122 loop {
94- let tok = self . next_token ( ) ? ;
123+ let tok = self . next_token ( ) ;
95124 if matches ! ( tok. kind, TokenKind :: EOF ) {
96125 tokens. push ( tok) ; break ;
97126 } else {
98127 tokens. push ( tok) ;
99128 }
100129 }
101- Ok ( tokens)
130+ if self . errors . is_empty ( ) {
131+ Ok ( tokens)
132+ } else {
133+ Err ( self . errors . clone ( ) )
134+ }
102135 }
103136
104- pub fn next_token ( & mut self ) -> CResult < Token > {
137+ pub fn next_token ( & mut self ) -> Token {
105138 self . skip_whitespace_and_comment ( ) ;
106139
107140 let start_line = self . line ;
108141 let start_col = self . col ;
109142
110143 let Some ( c) = self . peek ( ) else {
111- return Ok ( self . make_token ( TokenKind :: EOF , start_line, start_col) ) ;
144+ return self . make_token ( TokenKind :: EOF , start_line, start_col) ;
112145 } ;
113146
114- Ok ( if c. is_alphabetic ( ) {
147+ if c. is_alphabetic ( ) {
115148 self . lex_identifier_or_keyword ( start_line, start_col)
116149 } else if c. is_ascii_digit ( ) {
117150 self . lex_number ( start_line, start_col)
@@ -120,14 +153,18 @@ impl Lexer {
120153 } else if c == '-' && self . peek_ahead ( 1 ) . map_or ( false , |n| n. is_ascii_digit ( ) ) {
121154 self . lex_number ( start_line, start_col)
122155 } else if c == '"' {
123- self . lex_string ( start_line, start_col) ?
156+ self . lex_string ( start_line, start_col)
124157 } else {
125- self . lex_symbol ( start_line, start_col) ?
126- } )
158+ self . lex_symbol ( start_line, start_col)
159+ }
127160 }
128161
129162 fn make_token ( & self , kind : TokenKind , line : usize , col : usize ) -> Token {
130- Token { kind, span : Span { start : Pos { line, col } , end : Pos { line : self . line , col : self . col } } }
163+ Token { kind, span : self . make_span ( line, col) }
164+ }
165+
166+ fn make_span ( & self , line : usize , col : usize ) -> Span {
167+ Span { start : Pos { line, col } , end : Pos { line : self . line , col : self . col } }
131168 }
132169
133170 // --------- Iteration ---------
@@ -174,11 +211,25 @@ impl Lexer {
174211 }
175212 }
176213 Some ( '/' ) if self . peek_ahead ( 1 ) == Some ( '*' ) => {
177- self . advance ( ) ; self . advance ( ) ; // consume "/*"
178- while !( self . peek ( ) == Some ( '*' ) && self . peek_ahead ( 1 ) == Some ( '/' ) ) {
179- if self . advance ( ) . is_none ( ) { break ; }
214+ let start_line = self . line ;
215+ let start_col = self . col ;
216+ let mut block_count = 0 ;
217+ loop {
218+ if self . peek ( ) == Some ( '/' ) && self . peek_ahead ( 1 ) == Some ( '*' ) {
219+ self . advance ( ) ; self . advance ( ) ; // consume "/*"
220+ block_count += 1 ;
221+ }
222+ if self . peek ( ) == Some ( '*' ) && self . peek_ahead ( 1 ) == Some ( '/' ) {
223+ self . advance ( ) ; self . advance ( ) ; // consume "*/"
224+ block_count -= 1 ;
225+ }
226+ if block_count <= 0 { break ; }
227+
228+ if self . advance ( ) . is_none ( ) {
229+ self . error ( self . make_span ( start_line, start_col) , "Unterminated block comment" ) ;
230+ break ;
231+ }
180232 }
181- self . advance ( ) ; self . advance ( ) ; // consume "*/"
182233 }
183234 _ => break ,
184235 }
@@ -323,13 +374,12 @@ impl Lexer {
323374
324375 // --------- Strings ---------
325376
326- fn lex_string ( & mut self , line : usize , col : usize ) -> CResult < Token > {
377+ fn lex_string ( & mut self , line : usize , col : usize ) -> Token {
327378 self . advance ( ) ; // consume '"'
328379 let mut s = String :: new ( ) ;
329380
330381 loop {
331382 match self . advance ( ) {
332- None => return Err ( CompilerError :: LexerUnterminatedString { line, col } ) ,
333383 Some ( '"' ) => break ,
334384 Some ( '\\' ) => {
335385 if let Some ( escaped) = self . advance ( ) {
@@ -346,15 +396,19 @@ impl Lexer {
346396 }
347397 }
348398 Some ( c) => s. push ( c) ,
399+ None => {
400+ self . error ( self . make_span ( line, col) , "Unterminated string literal" ) ;
401+ return self . make_token ( TokenKind :: UnterminatedString ( s) , line, col) ;
402+ }
349403 }
350404 }
351-
352- Ok ( self . make_token ( TokenKind :: String ( s) , line, col) )
405+
406+ self . make_token ( TokenKind :: String ( s) , line, col)
353407 }
354408
355409 // --------- Symbols & Operators ---------
356410
357- fn lex_symbol ( & mut self , line : usize , col : usize ) -> CResult < Token > {
411+ fn lex_symbol ( & mut self , line : usize , col : usize ) -> Token {
358412 use TokenKind :: * ;
359413 let c = self . advance ( ) . unwrap ( ) ;
360414
@@ -390,10 +444,22 @@ impl Lexer {
390444 ':' => Colon ,
391445 '.' => Dot ,
392446 '_' => Underscore ,
393- _ => return Err ( CompilerError :: LexerUnexpectedChar { line, col, c } ) ,
447+ c => {
448+ self . error ( self . make_span ( line, col) , format ! ( "Unexpected character '{}'" , c) ) ;
449+ Invalid ( c. to_string ( ) )
450+ }
394451 } ;
395452
396- Ok ( self . make_token ( kind, line, col) )
453+ self . make_token ( kind, line, col)
454+ }
455+
456+ // --------- Errors ---------
457+
458+ fn error ( & mut self , span : Span , message : impl Into < String > ) {
459+ self . errors . push ( LexError {
460+ span,
461+ message : message. into ( ) ,
462+ } ) ;
397463 }
398464}
399465
@@ -417,12 +483,13 @@ mod tests {
417483 let mut lexer = Lexer :: new ( src) ;
418484 let mut i = 0 ;
419485 loop {
420- let tok = lexer. next_token ( ) . unwrap ( ) ;
486+ let tok = lexer. next_token ( ) ;
421487 println ! ( "{:?}" , tok) ;
422488 assert_eq ! ( tok. kind, expected[ i] ) ;
423489 if matches ! ( tok. kind, TokenKind :: EOF ) { break ; }
424490 i += 1 ;
425491 }
492+ assert ! ( lexer. errors. is_empty( ) , "ERRORS: {:?}" , lexer. errors) ;
426493 }
427494
428495 #[ test]
@@ -438,11 +505,34 @@ mod tests {
438505 let mut lexer = Lexer :: new ( src) ;
439506 let mut i = 0 ;
440507 loop {
441- let tok = lexer. next_token ( ) . unwrap ( ) ;
508+ let tok = lexer. next_token ( ) ;
509+ println ! ( "{:?}" , tok) ;
510+ assert_eq ! ( tok. kind, expected[ i] ) ;
511+ if matches ! ( tok. kind, TokenKind :: EOF ) { break ; }
512+ i += 1 ;
513+ }
514+ assert ! ( lexer. errors. is_empty( ) , "ERRORS: {:?}" , lexer. errors) ;
515+ }
516+
// Error-recovery test: the input holds a character the lexer does not accept
// followed by a string literal that is never closed. The lexer is expected to
// keep going and emit placeholder tokens instead of stopping.
517+ #[ test]
518+ fn test_error ( ) {
519+ let src = r#"
520+ ° "adsjd
521+ "# ;
522+
// Expected stream: one Invalid token for the bad character, one
// UnterminatedString carrying the text read up to end of input, then EOF.
523+ let expected = vec ! [
524+ TokenKind :: Invalid ( "°" . into( ) ) , TokenKind :: UnterminatedString ( "adsjd\n " . into( ) ) , TokenKind :: EOF
525+ ] ;
526+ let mut lexer = Lexer :: new ( src) ;
527+ let mut i = 0 ;
// Pull tokens one at a time and compare each against the expected kind,
// stopping once EOF has been checked.
528+ loop {
529+ let tok = lexer. next_token ( ) ;
442530 println ! ( "{:?}" , tok) ;
443531 assert_eq ! ( tok. kind, expected[ i] ) ;
444532 if matches ! ( tok. kind, TokenKind :: EOF ) { break ; }
445533 i += 1 ;
446534 }
// Both problems must have been recorded: the unexpected character and the
// unterminated string.
535+ assert_eq ! ( lexer. errors. len( ) , 2 , "Error length mismatch: {:?}" , lexer. errors) ;
536+ println ! ( "Errors: {:?}" , lexer. errors)
447537 }
448538}
0 commit comments