boxworks_lang/
lexer.rs

//! Lexer and tokens for the Box language.
2
3use super::Str;
4use crate::Error;
5use std::{borrow::Cow, rc::Rc};
6
/// Box language lexer.
///
/// The lexer is an [`Iterator`] over [`Token`]s. It lexes the byte range
/// `l..u` of the source `s`; `l` moves forward as tokens are produced.
pub struct Lexer<'a> {
    /// The full source file being lexed.
    s: &'a str,
    /// Inclusive lower bound on the part of the file being lexed by this lexer.
    ///
    /// This is a byte offset into `s`.
    l: usize,
    /// Exclusive upper bound on the part of the file being lexed by this lexer.
    ///
    /// This is a byte offset into `s`.
    u: usize,
    /// Opening parens.
    ///
    /// There is one entry per opening bracket (`(` or `[`) found outside of
    /// comments and strings, in source order. Each entry holds the matching
    /// closing bracket, or `None` if the opening bracket is never closed.
    /// The table is reference counted so nested lexers can share it.
    op: Rc<[Option<ClosingParen>]>,
    /// Index of the next opening paren that is expected.
    ///
    /// This is an index into `op`.
    op_i: usize,
    /// Error accumulator.
    errs: super::ErrorAccumulator<'a>,
}
22
/// Opaque marker of a closing parenthesis.
///
/// Values are created by the lexer's bracket-matching pass and handed out
/// inside opening-bracket tokens.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ClosingParen {
    /// Index of the closing parenthesis in the source.
    /// Because this closing paren matches an opening paren,
    /// it cannot come at the start of the file and thus the index
    /// is strictly bigger than 0.
    ///
    /// This is a byte offset into the source string.
    source_idx: std::num::NonZeroUsize,
    /// Starting index of parens after this closing paren in the parens array.
    ///
    /// In other words: the number of opening brackets that appear in the
    /// source before this closing bracket.
    op_i: usize,
}
34
35impl ClosingParen {
36    fn str<'a>(&self, source: &'a str) -> Str<'a> {
37        Str {
38            value: source,
39            start: self.source_idx.get(),
40            end: self.source_idx.get() + 1,
41        }
42    }
43}
44
45impl<'a> Lexer<'a> {
46    /// Create a new Box language lexer.
47    pub fn new(source: &'a str, errs: super::ErrorAccumulator<'a>) -> Self {
48        Self {
49            s: source,
50            l: 0,
51            u: source.len(),
52            op: Self::build(source),
53            op_i: 0,
54            errs,
55        }
56    }
57
58    /// Splits off a nested lexer.
59    pub fn split_nested(&mut self, closing_paren: Option<ClosingParen>) -> Self {
60        let inner = Self {
61            s: self.s,
62            l: self.l,
63            u: match closing_paren {
64                Some(c) => c.source_idx.get(),
65                None => self.u,
66            },
67            op: self.op.clone(),
68            op_i: self.op_i,
69            errs: self.errs.clone(),
70        };
71        (self.l, self.op_i) = match closing_paren {
72            Some(c) => (c.source_idx.get() + 1, c.op_i),
73            None => (self.u, self.op.len()),
74        };
75        inner
76    }
77
78    pub fn remaining_source(&self) -> Str<'a> {
79        Str {
80            value: self.s,
81            start: self.l,
82            end: self.u,
83        }
84    }
85    fn build(source: &'a str) -> Rc<[Option<ClosingParen>]> {
86        #[derive(Clone, Copy)]
87        enum State {
88            Regular,
89            Comment,
90            String,
91        }
92        struct Stack {
93            i: usize,
94        }
95        let mut v: Vec<Option<ClosingParen>> = vec![];
96        let mut stack = vec![];
97        let mut state = State::Regular;
98        let mut i = 0;
99        for c in source.chars() {
100            match (c, state) {
101                ('(' | '[', State::Regular) => {
102                    stack.push(Stack { i: v.len() });
103                    v.push(None);
104                }
105                (')' | ']', State::Regular) => {
106                    if let Some(s) = stack.pop() {
107                        v[s.i] = Some(ClosingParen {
108                            source_idx: i.try_into().expect("i>0 because this character is preceded by a [ or ( that pushed to the stack"),
109                            op_i: v.len(),
110                        });
111                    }
112                }
113                ('\n', State::Comment) => {
114                    state = State::Regular;
115                }
116                ('#', State::Regular) => {
117                    state = State::Comment;
118                }
119                ('"', State::Regular) => {
120                    state = State::String;
121                }
122                ('"', State::String) => {
123                    state = State::Regular;
124                }
125                _ => {}
126            }
127            i += c.len_utf8();
128        }
129        v.into()
130    }
131}
132
/// A token in the Box language.
#[derive(Clone, Debug)]
pub struct Token<'a> {
    /// What kind of token this is, along with any parsed value.
    pub value: TokenValue<'a>,
    /// The span of the source that this token was lexed from.
    pub source: Str<'a>,
}
139
/// Value of a token in the Box language.
#[derive(Clone, Debug, PartialEq)]
pub enum TokenValue<'a> {
    /// Opening square bracket `[`.
    SquareOpen {
        /// The closing bracket that matches this opening bracket.
        ///
        /// If `None`, this opening bracket is not matched.
        /// If provided, the closing bracket may be either `)` or `]`.
        closing: Option<ClosingParen>,
    },
    /// Closing square bracket `]`.
    SquareClose,
    /// Opening round bracket `(`.
    RoundOpen {
        /// The closing bracket that matches this opening bracket.
        ///
        /// If `None`, this opening bracket is not matched.
        /// If provided, the closing bracket may be either `)` or `]`.
        closing: Option<ClosingParen>,
    },
    /// Closing round bracket `)`.
    RoundClose,
    /// Comma `,`.
    Comma,
    /// Equals sign `=`.
    Equal,
    /// A run of ASCII letters and underscores that begins with a letter.
    ///
    /// The text of the keyword is available from the token's source span.
    Keyword,
    /// A double-quoted string literal, with escape sequences resolved.
    ///
    /// The value borrows from the source when the literal contains no
    /// escape sequences.
    String(Cow<'a, str>),
    /// A number with no decimal point and no unit.
    Integer(i32),
    /// A number followed by a dimension unit.
    ///
    /// The lexer also produces a zero value of this variant as a placeholder
    /// after reporting a malformed number.
    Scaled(common::Scaled),
    /// A number followed by a glue order in place of a dimension unit.
    InfiniteGlue(common::Scaled, common::GlueOrder),
    /// A comment.
    ///
    /// The token's source span covers the comment text after the `#`.
    Comment,
}
169
170impl<'a> Iterator for Lexer<'a> {
171    type Item = Token<'a>;
172
173    fn next(&mut self) -> Option<Token<'a>> {
174        // Consume whitespace and comments
175        let mut comment_start: Option<usize> = None;
176        while let Some(c) = self.s[self.l..self.u].chars().next() {
177            let should_skip = match c {
178                '\n' => {
179                    if let Some(comment_start) = comment_start.take() {
180                        return Some(Token {
181                            value: TokenValue::Comment,
182                            source: Str {
183                                value: self.s,
184                                start: comment_start,
185                                end: self.l,
186                            },
187                        });
188                    }
189                    true
190                }
191                '#' => {
192                    if comment_start.is_none() {
193                        comment_start = Some(self.l + 1);
194                    }
195                    true
196                }
197                c => comment_start.is_some() || c.is_whitespace(),
198            };
199            if !should_skip {
200                break;
201            }
202            self.l += c.len_utf8();
203        }
204        // Now look at the token
205        let mut iter = self.s[self.l..self.u].chars();
206        let c = iter.next()?;
207        let start = self.l;
208        self.l += c.len_utf8();
209        use TokenValue::*;
210        let value = match c {
211            '[' | '(' => {
212                let closing = self.op.get(self.op_i).cloned().flatten();
213                let open = Str {
214                    value: self.s,
215                    start,
216                    end: start + 1,
217                };
218                match &closing {
219                    Some(closing) => {
220                        let close = closing.str(self.s);
221                        let want = if c == '[' { "]" } else { ")" };
222                        if close.str() != want {
223                            self.errs.add(Error::MismatchedBraces { open, close });
224                        }
225                    }
226                    None => {
227                        self.errs.add(Error::UnmatchedOpeningBracket { open });
228                    }
229                };
230                self.op_i += 1;
231                if c == '[' {
232                    SquareOpen { closing }
233                } else {
234                    RoundOpen { closing }
235                }
236            }
237            ']' => SquareClose,
238            ')' => RoundClose,
239            '=' => Equal,
240            ',' => Comma,
241            'a'..='z' | 'A'..='Z' => {
242                while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
243                    self.l += n.len_utf8();
244                }
245                Keyword
246            }
247            '"' => {
248                // TODO: only allocate a buffer if we're going to use it
249                let mut buf: std::string::String = Default::default();
250                loop {
251                    let Some(n) = iter.next() else {
252                        // TODO: error in this case?
253                        return None;
254                    };
255                    self.l += n.len_utf8();
256                    match n {
257                        '"' => {
258                            break;
259                        }
260                        // Escape character
261                        //
262                        // We support a subset of Rust escape characters, which are documented
263                        // here: https://doc.rust-lang.org/reference/expressions/literal-expr.html.
264                        '\\' => {
265                            let Some(n) = iter.next() else {
266                                // TODO: error in this case?
267                                return None;
268                            };
269                            self.l += n.len_utf8();
270                            match n {
271                                '\"' | '\\' => {
272                                    buf.push(n);
273                                }
274                                'u' => {
275                                    if iter.next() != Some('{') {
276                                        // TODO error
277                                        continue;
278                                    }
279                                    self.l += '{'.len_utf8();
280                                    let mut i = 0;
281                                    let mut valid = true;
282                                    loop {
283                                        let Some(n) = iter.next() else {
284                                            // TODO: error in this case?
285                                            return None;
286                                        };
287                                        self.l += n.len_utf8();
288                                        if n == '}' {
289                                            // TODO: error if no number was provided.
290                                            break;
291                                        }
292                                        match n.to_digit(16) {
293                                            None => {
294                                                valid = false;
295                                            }
296                                            Some(d) => {
297                                                i = i * 16 + d;
298                                            }
299                                        }
300                                    }
301                                    if !valid {
302                                        // TODO: error
303                                        continue;
304                                    }
305                                    let Some(c) = char::from_u32(i) else {
306                                        // TODO: error
307                                        continue;
308                                    };
309                                    buf.push(c);
310                                }
311                                _ => {
312                                    // TODO: error for unexpected special character
313                                }
314                            }
315                        }
316                        _ => {
317                            buf.push(n);
318                        }
319                    }
320                }
321                // If the string is exactly in this source (e.g. no special control sequences)
322                // then we can avoid an allocation.
323                let source = &self.s[start + 1..self.l - 1];
324                String(if buf.len() == source.len() {
325                    Cow::Borrowed(source)
326                } else {
327                    Cow::Owned(buf)
328                })
329            }
330            '0'..='9' => {
331                let initial_value = (c as i32) - ('0' as i32);
332                self.parse_number(false, initial_value, start)
333            }
334            '-' => self.parse_number(true, 0, start),
335            _ => {
336                self.errs.add(Error::InvalidCharacter {
337                    char: Str {
338                        value: self.s,
339                        start,
340                        end: self.l,
341                    },
342                });
343                return self.next();
344            }
345        };
346        Some(Token {
347            value,
348            source: Str {
349                value: self.s,
350                start,
351                end: self.l,
352            },
353        })
354    }
355}
356
impl<'a> Lexer<'a> {
    /// Lexes the rest of a number and returns the resulting token value.
    ///
    /// The caller has already consumed the first character of the number
    /// (a digit or `-`), so `self.l` points just after it.
    ///
    /// - `negative`: whether the number began with `-`.
    /// - `initial_value`: value of the first digit, or 0 if the number began with `-`.
    /// - `start_idx`: byte offset of the number's first character, used in error spans.
    ///
    /// Returns an `Integer` when there is no decimal point and no unit, a
    /// `Scaled` or `InfiniteGlue` when a recognised unit follows, and
    /// `Scaled(ZERO)` after reporting an error.
    ///
    /// # Panics
    ///
    /// Panics if the integer part overflows an `i32`, or if the
    /// `common::Scaled::new` result is unwrapped unsuccessfully.
    fn parse_number(
        &mut self,
        negative: bool,
        initial_value: i32,
        start_idx: usize,
    ) -> TokenValue<'a> {
        let mut iter = self.s[self.l..self.u].chars();
        // Integer part accumulated so far.
        let mut n = initial_value;
        // True until a decimal point has been seen.
        let mut parsing_n = true;
        // Digits after the decimal point; at most 17 are stored.
        let mut d = [0_u8; 17];
        let mut next_d = 0_usize;
        loop {
            match iter.next() {
                Some(c @ '0'..='9') => {
                    let i = (c as i32) - ('0' as i32);
                    if parsing_n {
                        n = n.checked_mul(10).unwrap();
                        n = n.checked_add(i).unwrap();
                    } else {
                        // Fractional digits beyond the buffer are consumed but ignored.
                        if let Some(d) = d.get_mut(next_d) {
                            *d = i.try_into().expect("i in [0,9]")
                        }
                        next_d += 1;
                    }
                    self.l += c.len_utf8();
                }
                Some(d @ '.') => {
                    // A second decimal point is reported, then lexing carries on.
                    if !parsing_n {
                        self.errs.add(Error::MultipleDecimalPoints {
                            point: Str {
                                value: self.s,
                                start: self.l,
                                end: self.l + d.len_utf8(),
                            },
                        });
                    }
                    parsing_n = false;
                    self.l += d.len_utf8();
                }
                Some(c @ 'a'..='z' | c @ 'A'..='Z') => {
                    // Start of the unit: a run of ASCII letters and underscores.
                    let u = self.l;
                    self.l += c.len_utf8();
                    while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
                        self.l += n.len_utf8();
                    }

                    // Unit-less value; only used if the unit turns out to be a glue order.
                    let mut s = common::Scaled::from_decimal_digits(&d) + common::Scaled::ONE * n;
                    if negative {
                        s.0 *= -1;
                    }
                    let raw_unit = &self.s[u..self.l];
                    if let Some(unit) = common::ScaledUnit::parse(raw_unit) {
                        let mut s =
                            common::Scaled::new(n, common::Scaled::from_decimal_digits(&d), unit)
                                .unwrap();
                        if negative {
                            s = -s;
                        }
                        return TokenValue::Scaled(s);
                    }
                    if let Some(glue_order) = common::GlueOrder::parse(raw_unit) {
                        return TokenValue::InfiniteGlue(s, glue_order);
                    }
                    self.errs.add(Error::InvalidDimensionUnit {
                        dimension: Str {
                            value: self.s,
                            start: start_idx,
                            end: self.l,
                        },
                        unit: Str {
                            value: self.s,
                            start: u,
                            end: self.l,
                        },
                    });
                    return TokenValue::Scaled(common::Scaled::ZERO);
                }
                // Any other character, or the end of the range, ends the number.
                // The character is not consumed.
                d => {
                    if !parsing_n {
                        // NOTE(review): this span covers the character after the
                        // number, not the number itself (`start_idx..self.l`).
                        // Confirm which one the `number` field is meant to hold.
                        self.errs.add(Error::NumberWithoutUnits {
                            number: Str {
                                value: self.s,
                                start: self.l,
                                end: self.l + d.map(|c| c.len_utf8()).unwrap_or(0),
                            },
                        });
                        return TokenValue::Scaled(common::Scaled::ZERO);
                    }
                    if negative {
                        n *= -1;
                    }
                    return TokenValue::Integer(n);
                }
            }
        }
    }
}
455
#[cfg(test)]
mod tests {
    use crate::ErrorAccumulator;

    use super::*;

    /// Lexes `input` to completion and compares the token values with `want`.
    ///
    /// Source spans and accumulated errors are not checked.
    fn run_lexer_test(input: &str, want: Vec<TokenValue>) {
        let errs = ErrorAccumulator::default();
        let got: Vec<TokenValue> = Lexer::new(input, errs)
            .map(|token| token.value)
            .collect();

        assert_eq!(got, want);
    }

    /// Generates one `#[test]` function per `(name, input, want,)` entry.
    macro_rules! lexer_tests {
        ( $( ($name: ident, $input: expr, $want: expr, ), )+ ) => {
            $(
                #[test]
                fn $name() {
                    run_lexer_test($input, $want);
                }
            )+
        };
    }

    lexer_tests!(
        (
            string_simple,
            r#" "string" "#,
            vec![TokenValue::String("string".into())],
        ),
        (
            string_with_special_char_1,
            r#" "\"" "#,
            vec![TokenValue::String("\"".into())],
        ),
        (
            string_with_special_char_2,
            r#" "\\" "#,
            vec![TokenValue::String("\\".into())],
        ),
        (
            string_with_invalid_special_char,
            r#" "\a" "#,
            vec![TokenValue::String("".into())],
        ),
        (
            string_with_unicode,
            r#" "\u{100}", "second" "#,
            vec![
                TokenValue::String("\u{100}".into()),
                TokenValue::Comma,
                TokenValue::String("second".into()),
            ],
        ),
    );
}
514}