1use super::Error;
4use super::Str;
5use std::{borrow::Cow, rc::Rc};
6
7pub struct Lexer<'a> {
9 s: &'a str,
11 l: usize,
13 u: usize,
15 op: Rc<[Option<ClosingParen>]>,
17 op_i: usize,
19 errs: super::ErrorAccumulator<'a>,
21}
22
/// Location of the closing bracket that pairs with an opening bracket.
///
/// `Eq` is derived alongside `PartialEq` because both fields have total
/// equality.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ClosingParen {
    /// Byte offset of the closing bracket in the source. It is never zero
    /// because a closing bracket is always preceded by its opening bracket.
    source_idx: std::num::NonZeroUsize,
    /// Length of the bracket table at the time the closing bracket was seen,
    /// i.e. the table index of the first opening bracket after it.
    op_i: usize,
}
34
35impl ClosingParen {
36 fn str<'a>(&self, source: &'a str) -> Str<'a> {
37 Str {
38 value: source,
39 start: self.source_idx.get(),
40 end: self.source_idx.get() + 1,
41 }
42 }
43}
44
45impl<'a> Lexer<'a> {
46 pub fn new(source: &'a str, errs: super::ErrorAccumulator<'a>) -> Self {
48 Self {
49 s: source,
50 l: 0,
51 u: source.len(),
52 op: Self::build(source),
53 op_i: 0,
54 errs,
55 }
56 }
57
58 pub fn split_nested(&mut self, closing_paren: Option<ClosingParen>) -> Self {
60 let inner = Self {
61 s: self.s,
62 l: self.l,
63 u: match closing_paren {
64 Some(c) => c.source_idx.get(),
65 None => self.u,
66 },
67 op: self.op.clone(),
68 op_i: self.op_i,
69 errs: self.errs.clone(),
70 };
71 (self.l, self.op_i) = match closing_paren {
72 Some(c) => (c.source_idx.get() + 1, c.op_i),
73 None => (self.u, self.op.len()),
74 };
75 inner
76 }
77
78 pub fn remaining_source(&self) -> Str<'a> {
79 Str {
80 value: self.s,
81 start: self.l,
82 end: self.u,
83 }
84 }
85 fn build(source: &'a str) -> Rc<[Option<ClosingParen>]> {
86 #[derive(Clone, Copy)]
87 enum State {
88 Regular,
89 Comment,
90 String,
91 }
92 struct Stack {
93 i: usize,
94 }
95 let mut v: Vec<Option<ClosingParen>> = vec![];
96 let mut stack = vec![];
97 let mut state = State::Regular;
98 let mut i = 0;
99 for c in source.chars() {
100 match (c, state) {
101 ('(' | '[', State::Regular) => {
102 stack.push(Stack { i: v.len() });
103 v.push(None);
104 }
105 (')' | ']', State::Regular) => {
106 if let Some(s) = stack.pop() {
107 v[s.i] = Some(ClosingParen {
108 source_idx: i.try_into().expect("i>0 because this character is preceded by a [ or ( that pushed to the stack"),
109 op_i: v.len(),
110 });
111 }
112 }
113 ('\n', State::Comment) => {
114 state = State::Regular;
115 }
116 ('#', State::Regular) => {
117 state = State::Comment;
118 }
119 ('"', State::Regular) => {
120 state = State::String;
121 }
122 ('"', State::String) => {
123 state = State::Regular;
124 }
125 _ => {}
126 }
127 i += c.len_utf8();
128 }
129 v.into()
130 }
131}
132
133#[derive(Clone, Debug)]
135pub struct Token<'a> {
136 pub value: TokenValue<'a>,
137 pub source: Str<'a>,
138}
139
140#[derive(Clone, Debug, PartialEq)]
142pub enum TokenValue<'a> {
143 SquareOpen {
144 closing: Option<ClosingParen>,
149 },
150 SquareClose,
151 RoundOpen {
153 closing: Option<ClosingParen>,
158 },
159 RoundClose,
160 Comma,
161 Equal,
162 Keyword,
163 String(Cow<'a, str>),
164 Integer(i32),
165 Scaled(common::Scaled),
166 InfiniteGlue(common::Scaled, common::GlueOrder),
167 Comment,
168}
169
170impl<'a> Iterator for Lexer<'a> {
171 type Item = Token<'a>;
172
173 fn next(&mut self) -> Option<Token<'a>> {
174 let mut comment_start: Option<usize> = None;
176 while let Some(c) = self.s[self.l..self.u].chars().next() {
177 let should_skip = match c {
178 '\n' => {
179 if let Some(comment_start) = comment_start.take() {
180 return Some(Token {
181 value: TokenValue::Comment,
182 source: Str {
183 value: self.s,
184 start: comment_start,
185 end: self.l,
186 },
187 });
188 }
189 true
190 }
191 '#' => {
192 if comment_start.is_none() {
193 comment_start = Some(self.l + 1);
194 }
195 true
196 }
197 c => comment_start.is_some() || c.is_whitespace(),
198 };
199 if !should_skip {
200 break;
201 }
202 self.l += c.len_utf8();
203 }
204 let mut iter = self.s[self.l..self.u].chars();
206 let c = iter.next()?;
207 let start = self.l;
208 self.l += c.len_utf8();
209 use TokenValue::*;
210 let value = match c {
211 '[' | '(' => {
212 let closing = self.op.get(self.op_i).cloned().flatten();
213 let open = Str {
214 value: self.s,
215 start,
216 end: start + 1,
217 };
218 match &closing {
219 Some(closing) => {
220 let close = closing.str(self.s);
221 let want = if c == '[' { "]" } else { ")" };
222 if close.str() != want {
223 self.errs.add(Error::MismatchedBraces { open, close });
224 }
225 }
226 None => {
227 self.errs.add(Error::UnmatchedOpeningBracket { open });
228 }
229 };
230 self.op_i += 1;
231 if c == '[' {
232 SquareOpen { closing }
233 } else {
234 RoundOpen { closing }
235 }
236 }
237 ']' => SquareClose,
238 ')' => RoundClose,
239 '=' => Equal,
240 ',' => Comma,
241 'a'..='z' | 'A'..='Z' => {
242 while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
243 self.l += n.len_utf8();
244 }
245 Keyword
246 }
247 '"' => {
248 let mut buf: std::string::String = Default::default();
250 loop {
251 let Some(n) = iter.next() else {
252 return None;
254 };
255 self.l += n.len_utf8();
256 let c: char = match n {
257 '"' => {
258 break;
259 }
260 '\\' => {
265 let Some(n) = iter.next() else {
266 return None;
268 };
269 self.l += n.len_utf8();
270 match n {
271 '\"' | '\'' | '\\' => n,
272 'n' => '\n',
273 't' => '\t',
274 '0' => '\0',
275 'r' => '\r',
276 'u' => {
277 if iter.next() != Some('{') {
278 continue;
280 }
281 self.l += '{'.len_utf8();
282 let mut i = 0;
283 let mut valid = true;
284 loop {
285 let Some(n) = iter.next() else {
286 return None;
288 };
289 self.l += n.len_utf8();
290 if n == '}' {
291 break;
293 }
294 match n.to_digit(16) {
295 None => {
296 valid = false;
297 }
298 Some(d) => {
299 i = i * 16 + d;
300 }
301 }
302 }
303 if !valid {
304 continue;
306 }
307 let Some(c) = char::from_u32(i) else {
308 continue;
310 };
311 c
312 }
313 _ => {
314 self.errs.add(Error::UnknownEscapeSequence {
315 sequence: Str {
316 value: self.s,
317 start: self.l - n.len_utf8() - 1,
318 end: self.l,
319 },
320 });
321 continue;
322 }
323 }
324 }
325 _ => n,
326 };
327 buf.push(c);
328 }
329 let source = &self.s[start + 1..self.l - 1];
332 String(if buf.len() == source.len() {
333 Cow::Borrowed(source)
334 } else {
335 Cow::Owned(buf)
336 })
337 }
338 '0'..='9' => {
339 let initial_value = (c as i32) - ('0' as i32);
340 self.parse_number(false, initial_value, start)
341 }
342 '-' => self.parse_number(true, 0, start),
343 _ => {
344 self.errs.add(Error::InvalidCharacter {
345 char: Str {
346 value: self.s,
347 start,
348 end: self.l,
349 },
350 });
351 return self.next();
352 }
353 };
354 Some(Token {
355 value,
356 source: Str {
357 value: self.s,
358 start,
359 end: self.l,
360 },
361 })
362 }
363}
364
365impl<'a> Lexer<'a> {
366 fn parse_number(
367 &mut self,
368 negative: bool,
369 initial_value: i32,
370 start_idx: usize,
371 ) -> TokenValue<'a> {
372 let mut iter = self.s[self.l..self.u].chars();
373 let mut n = initial_value;
374 let mut parsing_n = true;
375 let mut d = [0_u8; 17];
376 let mut next_d = 0_usize;
377 loop {
378 match iter.next() {
379 Some(c @ '0'..='9') => {
380 let i = (c as i32) - ('0' as i32);
381 if parsing_n {
382 n = n.checked_mul(10).unwrap();
383 n = n.checked_add(i).unwrap();
384 } else {
385 if let Some(d) = d.get_mut(next_d) {
386 *d = i.try_into().expect("i in [0,9]")
387 }
388 next_d += 1;
389 }
390 self.l += c.len_utf8();
391 }
392 Some(d @ '.') => {
393 if !parsing_n {
394 self.errs.add(Error::MultipleDecimalPoints {
395 point: Str {
396 value: self.s,
397 start: self.l,
398 end: self.l + d.len_utf8(),
399 },
400 });
401 }
402 parsing_n = false;
403 self.l += d.len_utf8();
404 }
405 Some(c @ 'a'..='z' | c @ 'A'..='Z') => {
406 let u = self.l;
407 self.l += c.len_utf8();
408 while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
409 self.l += n.len_utf8();
410 }
411
412 let mut s = common::Scaled::from_decimal_digits(&d) + common::Scaled::ONE * n;
413 if negative {
414 s.0 *= -1;
415 }
416 let raw_unit = &self.s[u..self.l];
417 if let Some(unit) = common::ScaledUnit::parse(raw_unit) {
418 let mut s =
419 common::Scaled::new(n, common::Scaled::from_decimal_digits(&d), unit)
420 .unwrap();
421 if negative {
422 s = -s;
423 }
424 return TokenValue::Scaled(s);
425 }
426 if let Some(glue_order) = common::GlueOrder::parse(raw_unit) {
427 return TokenValue::InfiniteGlue(s, glue_order);
428 }
429 self.errs.add(Error::InvalidDimensionUnit {
430 dimension: Str {
431 value: self.s,
432 start: start_idx,
433 end: self.l,
434 },
435 unit: Str {
436 value: self.s,
437 start: u,
438 end: self.l,
439 },
440 });
441 return TokenValue::Scaled(common::Scaled::ZERO);
442 }
443 d => {
444 if !parsing_n {
445 self.errs.add(Error::NumberWithoutUnits {
446 number: Str {
447 value: self.s,
448 start: self.l,
449 end: self.l + d.map(|c| c.len_utf8()).unwrap_or(0),
450 },
451 });
452 return TokenValue::Scaled(common::Scaled::ZERO);
453 }
454 if negative {
455 n *= -1;
456 }
457 return TokenValue::Integer(n);
458 }
459 }
460 }
461 }
462}
463
#[cfg(test)]
mod tests {
    use super::super::ErrorAccumulator;
    use super::*;

    /// Lexes `input` and checks that the token values equal `want`.
    /// Token spans and recorded errors are not checked.
    fn run_lexer_test(input: &str, want: Vec<TokenValue>) {
        let lexer = Lexer::new(input, ErrorAccumulator::default());
        let got: Vec<TokenValue> = lexer.map(|token| token.value).collect();
        assert_eq!(got, want);
    }

    /// Expands each `(name, input, want,)` triple into a `#[test]` function.
    macro_rules! lexer_tests {
        ( $( ($name: ident, $input: expr, $want: expr, ), )+ ) => {
            $(
                #[test]
                fn $name() {
                    run_lexer_test($input, $want);
                }
            )+
        };
    }

    lexer_tests!(
        (
            string_simple,
            r#" "string" "#,
            vec![TokenValue::String("string".into())],
        ),
        (
            string_with_special_char_1,
            r#" "\"" "#,
            vec![TokenValue::String("\"".into())],
        ),
        (
            string_with_special_char_2,
            r#" "\\" "#,
            vec![TokenValue::String("\\".into())],
        ),
        (
            string_with_invalid_special_char,
            r#" "\a" "#,
            vec![TokenValue::String("".into())],
        ),
        (
            string_with_unicode,
            r#" "\u{100}", "second" "#,
            vec![
                TokenValue::String("\u{100}".into()),
                TokenValue::Comma,
                TokenValue::String("second".into()),
            ],
        ),
    );
}