boxworks/lang/
lexer.rs

1//! Lexer and tokens for Box language.
2
3use super::Error;
4use super::Str;
5use std::{borrow::Cow, rc::Rc};
6
7/// Box language lexer.
8pub struct Lexer<'a> {
9    /// The full source file being lexed.
10    s: &'a str,
11    /// Inclusive lower bound on the part of the file being lexed by this lexer.
12    l: usize,
13    /// Exclusive upper bound on the part of the file being lexed by this lexer.
14    u: usize,
15    /// Opening parens.
16    op: Rc<[Option<ClosingParen>]>,
17    /// Index of the next opening paren that is expected.
18    op_i: usize,
19    /// Error accumulator.
20    errs: super::ErrorAccumulator<'a>,
21}
22
/// Opaque handle for the closing bracket that was paired with an opening
/// bracket.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ClosingParen {
    /// Byte offset of the closing bracket in the source.
    ///
    /// A closing bracket is only recorded when an opening bracket precedes
    /// it, so it can never sit at offset 0; the offset is therefore stored
    /// as a non-zero integer.
    source_idx: std::num::NonZeroUsize,
    /// Index, in the bracket table, of the first opening bracket that comes
    /// after this closing bracket.
    op_i: usize,
}
34
35impl ClosingParen {
36    fn str<'a>(&self, source: &'a str) -> Str<'a> {
37        Str {
38            value: source,
39            start: self.source_idx.get(),
40            end: self.source_idx.get() + 1,
41        }
42    }
43}
44
45impl<'a> Lexer<'a> {
46    /// Create a new Box language lexer.
47    pub fn new(source: &'a str, errs: super::ErrorAccumulator<'a>) -> Self {
48        Self {
49            s: source,
50            l: 0,
51            u: source.len(),
52            op: Self::build(source),
53            op_i: 0,
54            errs,
55        }
56    }
57
58    /// Splits off a nested lexer.
59    pub fn split_nested(&mut self, closing_paren: Option<ClosingParen>) -> Self {
60        let inner = Self {
61            s: self.s,
62            l: self.l,
63            u: match closing_paren {
64                Some(c) => c.source_idx.get(),
65                None => self.u,
66            },
67            op: self.op.clone(),
68            op_i: self.op_i,
69            errs: self.errs.clone(),
70        };
71        (self.l, self.op_i) = match closing_paren {
72            Some(c) => (c.source_idx.get() + 1, c.op_i),
73            None => (self.u, self.op.len()),
74        };
75        inner
76    }
77
78    pub fn remaining_source(&self) -> Str<'a> {
79        Str {
80            value: self.s,
81            start: self.l,
82            end: self.u,
83        }
84    }
85    fn build(source: &'a str) -> Rc<[Option<ClosingParen>]> {
86        #[derive(Clone, Copy)]
87        enum State {
88            Regular,
89            Comment,
90            String,
91        }
92        struct Stack {
93            i: usize,
94        }
95        let mut v: Vec<Option<ClosingParen>> = vec![];
96        let mut stack = vec![];
97        let mut state = State::Regular;
98        let mut i = 0;
99        for c in source.chars() {
100            match (c, state) {
101                ('(' | '[', State::Regular) => {
102                    stack.push(Stack { i: v.len() });
103                    v.push(None);
104                }
105                (')' | ']', State::Regular) => {
106                    if let Some(s) = stack.pop() {
107                        v[s.i] = Some(ClosingParen {
108                            source_idx: i.try_into().expect("i>0 because this character is preceded by a [ or ( that pushed to the stack"),
109                            op_i: v.len(),
110                        });
111                    }
112                }
113                ('\n', State::Comment) => {
114                    state = State::Regular;
115                }
116                ('#', State::Regular) => {
117                    state = State::Comment;
118                }
119                ('"', State::Regular) => {
120                    state = State::String;
121                }
122                ('"', State::String) => {
123                    state = State::Regular;
124                }
125                _ => {}
126            }
127            i += c.len_utf8();
128        }
129        v.into()
130    }
131}
132
133/// A token in the Box language.
134#[derive(Clone, Debug)]
135pub struct Token<'a> {
136    pub value: TokenValue<'a>,
137    pub source: Str<'a>,
138}
139
140/// Value of a token in the Box language.
141#[derive(Clone, Debug, PartialEq)]
142pub enum TokenValue<'a> {
143    SquareOpen {
144        /// The closing bracket that matches this opening bracket.
145        ///
146        /// If `None`, this opening bracket is not matched.
147        /// If provided, the closing bracket may be either `)` or `]`.
148        closing: Option<ClosingParen>,
149    },
150    SquareClose,
151    /// Opening round bracket `(`.
152    RoundOpen {
153        /// The closing bracket that matches this opening bracket.
154        ///
155        /// If `None`, this opening bracket is not matched.
156        /// If provided, the closing bracket may be either `)` or `]`.
157        closing: Option<ClosingParen>,
158    },
159    RoundClose,
160    Comma,
161    Equal,
162    Keyword,
163    String(Cow<'a, str>),
164    Integer(i32),
165    Scaled(common::Scaled),
166    InfiniteGlue(common::Scaled, common::GlueOrder),
167    Comment,
168}
169
170impl<'a> Iterator for Lexer<'a> {
171    type Item = Token<'a>;
172
173    fn next(&mut self) -> Option<Token<'a>> {
174        // Consume whitespace and comments
175        let mut comment_start: Option<usize> = None;
176        while let Some(c) = self.s[self.l..self.u].chars().next() {
177            let should_skip = match c {
178                '\n' => {
179                    if let Some(comment_start) = comment_start.take() {
180                        return Some(Token {
181                            value: TokenValue::Comment,
182                            source: Str {
183                                value: self.s,
184                                start: comment_start,
185                                end: self.l,
186                            },
187                        });
188                    }
189                    true
190                }
191                '#' => {
192                    if comment_start.is_none() {
193                        comment_start = Some(self.l + 1);
194                    }
195                    true
196                }
197                c => comment_start.is_some() || c.is_whitespace(),
198            };
199            if !should_skip {
200                break;
201            }
202            self.l += c.len_utf8();
203        }
204        // Now look at the token
205        let mut iter = self.s[self.l..self.u].chars();
206        let c = iter.next()?;
207        let start = self.l;
208        self.l += c.len_utf8();
209        use TokenValue::*;
210        let value = match c {
211            '[' | '(' => {
212                let closing = self.op.get(self.op_i).cloned().flatten();
213                let open = Str {
214                    value: self.s,
215                    start,
216                    end: start + 1,
217                };
218                match &closing {
219                    Some(closing) => {
220                        let close = closing.str(self.s);
221                        let want = if c == '[' { "]" } else { ")" };
222                        if close.str() != want {
223                            self.errs.add(Error::MismatchedBraces { open, close });
224                        }
225                    }
226                    None => {
227                        self.errs.add(Error::UnmatchedOpeningBracket { open });
228                    }
229                };
230                self.op_i += 1;
231                if c == '[' {
232                    SquareOpen { closing }
233                } else {
234                    RoundOpen { closing }
235                }
236            }
237            ']' => SquareClose,
238            ')' => RoundClose,
239            '=' => Equal,
240            ',' => Comma,
241            'a'..='z' | 'A'..='Z' => {
242                while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
243                    self.l += n.len_utf8();
244                }
245                Keyword
246            }
247            '"' => {
248                // TODO: only allocate a buffer if we're going to use it
249                let mut buf: std::string::String = Default::default();
250                loop {
251                    let Some(n) = iter.next() else {
252                        // TODO: error in this case?
253                        return None;
254                    };
255                    self.l += n.len_utf8();
256                    let c: char = match n {
257                        '"' => {
258                            break;
259                        }
260                        // Escape character
261                        //
262                        // We support a subset of Rust escape characters, which are documented
263                        // here: https://doc.rust-lang.org/reference/expressions/literal-expr.html.
264                        '\\' => {
265                            let Some(n) = iter.next() else {
266                                // TODO: error in this case?
267                                return None;
268                            };
269                            self.l += n.len_utf8();
270                            match n {
271                                '\"' | '\'' | '\\' => n,
272                                'n' => '\n',
273                                't' => '\t',
274                                '0' => '\0',
275                                'r' => '\r',
276                                'u' => {
277                                    if iter.next() != Some('{') {
278                                        // TODO error
279                                        continue;
280                                    }
281                                    self.l += '{'.len_utf8();
282                                    let mut i = 0;
283                                    let mut valid = true;
284                                    loop {
285                                        let Some(n) = iter.next() else {
286                                            // TODO: error in this case?
287                                            return None;
288                                        };
289                                        self.l += n.len_utf8();
290                                        if n == '}' {
291                                            // TODO: error if no number was provided.
292                                            break;
293                                        }
294                                        match n.to_digit(16) {
295                                            None => {
296                                                valid = false;
297                                            }
298                                            Some(d) => {
299                                                i = i * 16 + d;
300                                            }
301                                        }
302                                    }
303                                    if !valid {
304                                        // TODO: error
305                                        continue;
306                                    }
307                                    let Some(c) = char::from_u32(i) else {
308                                        // TODO: error
309                                        continue;
310                                    };
311                                    c
312                                }
313                                _ => {
314                                    self.errs.add(Error::UnknownEscapeSequence {
315                                        sequence: Str {
316                                            value: self.s,
317                                            start: self.l - n.len_utf8() - 1,
318                                            end: self.l,
319                                        },
320                                    });
321                                    continue;
322                                }
323                            }
324                        }
325                        _ => n,
326                    };
327                    buf.push(c);
328                }
329                // If the string is exactly in this source (e.g. no special control sequences)
330                // then we can avoid an allocation.
331                let source = &self.s[start + 1..self.l - 1];
332                String(if buf.len() == source.len() {
333                    Cow::Borrowed(source)
334                } else {
335                    Cow::Owned(buf)
336                })
337            }
338            '0'..='9' => {
339                let initial_value = (c as i32) - ('0' as i32);
340                self.parse_number(false, initial_value, start)
341            }
342            '-' => self.parse_number(true, 0, start),
343            _ => {
344                self.errs.add(Error::InvalidCharacter {
345                    char: Str {
346                        value: self.s,
347                        start,
348                        end: self.l,
349                    },
350                });
351                return self.next();
352            }
353        };
354        Some(Token {
355            value,
356            source: Str {
357                value: self.s,
358                start,
359                end: self.l,
360            },
361        })
362    }
363}
364
365impl<'a> Lexer<'a> {
366    fn parse_number(
367        &mut self,
368        negative: bool,
369        initial_value: i32,
370        start_idx: usize,
371    ) -> TokenValue<'a> {
372        let mut iter = self.s[self.l..self.u].chars();
373        let mut n = initial_value;
374        let mut parsing_n = true;
375        let mut d = [0_u8; 17];
376        let mut next_d = 0_usize;
377        loop {
378            match iter.next() {
379                Some(c @ '0'..='9') => {
380                    let i = (c as i32) - ('0' as i32);
381                    if parsing_n {
382                        n = n.checked_mul(10).unwrap();
383                        n = n.checked_add(i).unwrap();
384                    } else {
385                        if let Some(d) = d.get_mut(next_d) {
386                            *d = i.try_into().expect("i in [0,9]")
387                        }
388                        next_d += 1;
389                    }
390                    self.l += c.len_utf8();
391                }
392                Some(d @ '.') => {
393                    if !parsing_n {
394                        self.errs.add(Error::MultipleDecimalPoints {
395                            point: Str {
396                                value: self.s,
397                                start: self.l,
398                                end: self.l + d.len_utf8(),
399                            },
400                        });
401                    }
402                    parsing_n = false;
403                    self.l += d.len_utf8();
404                }
405                Some(c @ 'a'..='z' | c @ 'A'..='Z') => {
406                    let u = self.l;
407                    self.l += c.len_utf8();
408                    while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
409                        self.l += n.len_utf8();
410                    }
411
412                    let mut s = common::Scaled::from_decimal_digits(&d) + common::Scaled::ONE * n;
413                    if negative {
414                        s.0 *= -1;
415                    }
416                    let raw_unit = &self.s[u..self.l];
417                    if let Some(unit) = common::ScaledUnit::parse(raw_unit) {
418                        let mut s =
419                            common::Scaled::new(n, common::Scaled::from_decimal_digits(&d), unit)
420                                .unwrap();
421                        if negative {
422                            s = -s;
423                        }
424                        return TokenValue::Scaled(s);
425                    }
426                    if let Some(glue_order) = common::GlueOrder::parse(raw_unit) {
427                        return TokenValue::InfiniteGlue(s, glue_order);
428                    }
429                    self.errs.add(Error::InvalidDimensionUnit {
430                        dimension: Str {
431                            value: self.s,
432                            start: start_idx,
433                            end: self.l,
434                        },
435                        unit: Str {
436                            value: self.s,
437                            start: u,
438                            end: self.l,
439                        },
440                    });
441                    return TokenValue::Scaled(common::Scaled::ZERO);
442                }
443                d => {
444                    if !parsing_n {
445                        self.errs.add(Error::NumberWithoutUnits {
446                            number: Str {
447                                value: self.s,
448                                start: self.l,
449                                end: self.l + d.map(|c| c.len_utf8()).unwrap_or(0),
450                            },
451                        });
452                        return TokenValue::Scaled(common::Scaled::ZERO);
453                    }
454                    if negative {
455                        n *= -1;
456                    }
457                    return TokenValue::Integer(n);
458                }
459            }
460        }
461    }
462}
463
#[cfg(test)]
mod tests {
    use super::super::ErrorAccumulator;
    use super::*;

    /// Lexes `input` and checks that the values of the tokens produced are
    /// exactly `want`, in order.
    fn run_lexer_test(input: &str, want: Vec<TokenValue>) {
        let lexer = Lexer::new(input, ErrorAccumulator::default());
        let got: Vec<TokenValue> = lexer.map(|token| token.value).collect();
        assert_eq!(got, want);
    }

    /// Declares one `#[test]` function per `(name, input, expected values)`
    /// entry.
    macro_rules! lexer_tests {
        ( $( ($test_name: ident, $source: expr, $expected: expr, ), )+ ) => {
            $(
                #[test]
                fn $test_name() {
                    run_lexer_test($source, $expected);
                }
            )+
        };
    }

    lexer_tests!(
        (
            string_simple,
            r#" "string" "#,
            vec![TokenValue::String("string".into())],
        ),
        (
            string_with_special_char_1,
            r#" "\"" "#,
            vec![TokenValue::String("\"".into())],
        ),
        (
            string_with_special_char_2,
            r#" "\\" "#,
            vec![TokenValue::String("\\".into())],
        ),
        (
            string_with_invalid_special_char,
            r#" "\a" "#,
            vec![TokenValue::String("".into())],
        ),
        (
            string_with_unicode,
            r#" "\u{100}", "second" "#,
            vec![
                TokenValue::String("\u{100}".into()),
                TokenValue::Comma,
                TokenValue::String("second".into()),
            ],
        ),
    );
}