1use super::Str;
4use crate::Error;
5use std::{borrow::Cow, rc::Rc};
6
/// Streaming tokenizer over a source string.
///
/// Tokens are produced lazily through the `Iterator` impl. Bracket-matching
/// information for the whole source is precomputed once up front (see
/// `Lexer::build`) so each opening bracket token can carry its matching
/// closer.
pub struct Lexer<'a> {
    /// The full source text being lexed.
    s: &'a str,
    /// Lower cursor: byte offset of the next unread character.
    l: usize,
    /// Upper bound (exclusive): this lexer never reads at or past this
    /// byte offset. Equals `s.len()` for a top-level lexer; smaller for
    /// lexers produced by `split_nested`.
    u: usize,
    /// Precomputed table with one entry per opening bracket in the source,
    /// in source order: `Some` locates the matching closing bracket,
    /// `None` means the opener is unmatched. Shared via `Rc` so
    /// `split_nested` can clone it cheaply.
    op: Rc<[Option<ClosingParen>]>,
    /// Index into `op` of the next opening bracket to be tokenized.
    op_i: usize,
    /// Sink for non-fatal lexing errors; lexing continues after reporting.
    errs: super::ErrorAccumulator<'a>,
}
22
/// Location of the closing bracket that matches some opening bracket.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ClosingParen {
    /// Byte offset of the closing bracket character in the source.
    /// Non-zero because a closer is always preceded by its opener.
    source_idx: std::num::NonZeroUsize,
    /// The number of opening brackets that appear in the source before this
    /// closer — i.e. the value the lexer's `op_i` cursor should take when
    /// skipping past the enclosed region (see `Lexer::split_nested`).
    op_i: usize,
}
34
35impl ClosingParen {
36 fn str<'a>(&self, source: &'a str) -> Str<'a> {
37 Str {
38 value: source,
39 start: self.source_idx.get(),
40 end: self.source_idx.get() + 1,
41 }
42 }
43}
44
45impl<'a> Lexer<'a> {
46 pub fn new(source: &'a str, errs: super::ErrorAccumulator<'a>) -> Self {
48 Self {
49 s: source,
50 l: 0,
51 u: source.len(),
52 op: Self::build(source),
53 op_i: 0,
54 errs,
55 }
56 }
57
58 pub fn split_nested(&mut self, closing_paren: Option<ClosingParen>) -> Self {
60 let inner = Self {
61 s: self.s,
62 l: self.l,
63 u: match closing_paren {
64 Some(c) => c.source_idx.get(),
65 None => self.u,
66 },
67 op: self.op.clone(),
68 op_i: self.op_i,
69 errs: self.errs.clone(),
70 };
71 (self.l, self.op_i) = match closing_paren {
72 Some(c) => (c.source_idx.get() + 1, c.op_i),
73 None => (self.u, self.op.len()),
74 };
75 inner
76 }
77
78 pub fn remaining_source(&self) -> Str<'a> {
79 Str {
80 value: self.s,
81 start: self.l,
82 end: self.u,
83 }
84 }
85 fn build(source: &'a str) -> Rc<[Option<ClosingParen>]> {
86 #[derive(Clone, Copy)]
87 enum State {
88 Regular,
89 Comment,
90 String,
91 }
92 struct Stack {
93 i: usize,
94 }
95 let mut v: Vec<Option<ClosingParen>> = vec![];
96 let mut stack = vec![];
97 let mut state = State::Regular;
98 let mut i = 0;
99 for c in source.chars() {
100 match (c, state) {
101 ('(' | '[', State::Regular) => {
102 stack.push(Stack { i: v.len() });
103 v.push(None);
104 }
105 (')' | ']', State::Regular) => {
106 if let Some(s) = stack.pop() {
107 v[s.i] = Some(ClosingParen {
108 source_idx: i.try_into().expect("i>0 because this character is preceded by a [ or ( that pushed to the stack"),
109 op_i: v.len(),
110 });
111 }
112 }
113 ('\n', State::Comment) => {
114 state = State::Regular;
115 }
116 ('#', State::Regular) => {
117 state = State::Comment;
118 }
119 ('"', State::Regular) => {
120 state = State::String;
121 }
122 ('"', State::String) => {
123 state = State::Regular;
124 }
125 _ => {}
126 }
127 i += c.len_utf8();
128 }
129 v.into()
130 }
131}
132
/// A single lexed token: its decoded value plus the source span it came from.
#[derive(Clone, Debug)]
pub struct Token<'a> {
    /// The token's kind together with any decoded payload.
    pub value: TokenValue<'a>,
    /// The span of source text this token was lexed from.
    pub source: Str<'a>,
}
139
/// The kind of a lexed token, with decoded payloads where applicable.
#[derive(Clone, Debug, PartialEq)]
pub enum TokenValue<'a> {
    /// `[` — `closing` locates the matching `]`, or `None` if unmatched.
    SquareOpen {
        closing: Option<ClosingParen>,
    },
    /// `]`
    SquareClose,
    /// `(` — `closing` locates the matching `)`, or `None` if unmatched.
    RoundOpen {
        closing: Option<ClosingParen>,
    },
    /// `)`
    RoundClose,
    /// `,`
    Comma,
    /// `=`
    Equal,
    /// An alphabetic word (ASCII letters; underscores allowed after the
    /// first character). The spelling is in `Token::source`.
    Keyword,
    /// A `"…"` string literal with escapes resolved; borrows from the source
    /// when no escape required rewriting the content.
    String(Cow<'a, str>),
    /// An integer literal (possibly negative).
    Integer(i32),
    /// A number with a dimension unit recognized by `common::ScaledUnit`.
    Scaled(common::Scaled),
    /// A number with an infinite-glue unit recognized by `common::GlueOrder`.
    InfiniteGlue(common::Scaled, common::GlueOrder),
    /// A `#`-to-end-of-line comment; the text is in `Token::source`.
    Comment,
}
169
170impl<'a> Iterator for Lexer<'a> {
171 type Item = Token<'a>;
172
173 fn next(&mut self) -> Option<Token<'a>> {
174 let mut comment_start: Option<usize> = None;
176 while let Some(c) = self.s[self.l..self.u].chars().next() {
177 let should_skip = match c {
178 '\n' => {
179 if let Some(comment_start) = comment_start.take() {
180 return Some(Token {
181 value: TokenValue::Comment,
182 source: Str {
183 value: self.s,
184 start: comment_start,
185 end: self.l,
186 },
187 });
188 }
189 true
190 }
191 '#' => {
192 if comment_start.is_none() {
193 comment_start = Some(self.l + 1);
194 }
195 true
196 }
197 c => comment_start.is_some() || c.is_whitespace(),
198 };
199 if !should_skip {
200 break;
201 }
202 self.l += c.len_utf8();
203 }
204 let mut iter = self.s[self.l..self.u].chars();
206 let c = iter.next()?;
207 let start = self.l;
208 self.l += c.len_utf8();
209 use TokenValue::*;
210 let value = match c {
211 '[' | '(' => {
212 let closing = self.op.get(self.op_i).cloned().flatten();
213 let open = Str {
214 value: self.s,
215 start,
216 end: start + 1,
217 };
218 match &closing {
219 Some(closing) => {
220 let close = closing.str(self.s);
221 let want = if c == '[' { "]" } else { ")" };
222 if close.str() != want {
223 self.errs.add(Error::MismatchedBraces { open, close });
224 }
225 }
226 None => {
227 self.errs.add(Error::UnmatchedOpeningBracket { open });
228 }
229 };
230 self.op_i += 1;
231 if c == '[' {
232 SquareOpen { closing }
233 } else {
234 RoundOpen { closing }
235 }
236 }
237 ']' => SquareClose,
238 ')' => RoundClose,
239 '=' => Equal,
240 ',' => Comma,
241 'a'..='z' | 'A'..='Z' => {
242 while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
243 self.l += n.len_utf8();
244 }
245 Keyword
246 }
247 '"' => {
248 let mut buf: std::string::String = Default::default();
250 loop {
251 let Some(n) = iter.next() else {
252 return None;
254 };
255 self.l += n.len_utf8();
256 match n {
257 '"' => {
258 break;
259 }
260 '\\' => {
265 let Some(n) = iter.next() else {
266 return None;
268 };
269 self.l += n.len_utf8();
270 match n {
271 '\"' | '\\' => {
272 buf.push(n);
273 }
274 'u' => {
275 if iter.next() != Some('{') {
276 continue;
278 }
279 self.l += '{'.len_utf8();
280 let mut i = 0;
281 let mut valid = true;
282 loop {
283 let Some(n) = iter.next() else {
284 return None;
286 };
287 self.l += n.len_utf8();
288 if n == '}' {
289 break;
291 }
292 match n.to_digit(16) {
293 None => {
294 valid = false;
295 }
296 Some(d) => {
297 i = i * 16 + d;
298 }
299 }
300 }
301 if !valid {
302 continue;
304 }
305 let Some(c) = char::from_u32(i) else {
306 continue;
308 };
309 buf.push(c);
310 }
311 _ => {
312 }
314 }
315 }
316 _ => {
317 buf.push(n);
318 }
319 }
320 }
321 let source = &self.s[start + 1..self.l - 1];
324 String(if buf.len() == source.len() {
325 Cow::Borrowed(source)
326 } else {
327 Cow::Owned(buf)
328 })
329 }
330 '0'..='9' => {
331 let initial_value = (c as i32) - ('0' as i32);
332 self.parse_number(false, initial_value, start)
333 }
334 '-' => self.parse_number(true, 0, start),
335 _ => {
336 self.errs.add(Error::InvalidCharacter {
337 char: Str {
338 value: self.s,
339 start,
340 end: self.l,
341 },
342 });
343 return self.next();
344 }
345 };
346 Some(Token {
347 value,
348 source: Str {
349 value: self.s,
350 start,
351 end: self.l,
352 },
353 })
354 }
355}
356
impl<'a> Lexer<'a> {
    /// Parses the remainder of a numeric literal — an integer, a scaled
    /// dimension (number + unit), or infinite glue (number + glue order) —
    /// after its first character has already been consumed.
    ///
    /// `negative` is true when a leading `-` was consumed; `initial_value`
    /// is the value of any leading digit already consumed (0 otherwise);
    /// `start_idx` is the byte offset where the literal began, used only
    /// for error spans.
    fn parse_number(
        &mut self,
        negative: bool,
        initial_value: i32,
        start_idx: usize,
    ) -> TokenValue<'a> {
        let mut iter = self.s[self.l..self.u].chars();
        // Accumulator for the integer part.
        let mut n = initial_value;
        // True while we are before the decimal point.
        let mut parsing_n = true;
        // Fractional digits, base 10; digits beyond the 17th are dropped
        // (see the `d.get_mut(next_d)` guard below).
        let mut d = [0_u8; 17];
        let mut next_d = 0_usize;
        loop {
            match iter.next() {
                Some(c @ '0'..='9') => {
                    let i = (c as i32) - ('0' as i32);
                    if parsing_n {
                        // NOTE(review): these `unwrap`s panic on overflow of
                        // a long literal in user input — consider reporting
                        // via `self.errs` instead.
                        n = n.checked_mul(10).unwrap();
                        n = n.checked_add(i).unwrap();
                    } else {
                        if let Some(d) = d.get_mut(next_d) {
                            *d = i.try_into().expect("i in [0,9]")
                        }
                        next_d += 1;
                    }
                    self.l += c.len_utf8();
                }
                Some(d @ '.') => {
                    // Second (or later) decimal point: report, but keep
                    // consuming so lexing can continue.
                    if !parsing_n {
                        self.errs.add(Error::MultipleDecimalPoints {
                            point: Str {
                                value: self.s,
                                start: self.l,
                                end: self.l + d.len_utf8(),
                            },
                        });
                    }
                    parsing_n = false;
                    self.l += d.len_utf8();
                }
                Some(c @ 'a'..='z' | c @ 'A'..='Z') => {
                    // A unit suffix: consume the whole word, then try to
                    // interpret it as a dimension unit or a glue order.
                    let u = self.l;
                    self.l += c.len_utf8();
                    while let Some(n @ 'a'..='z' | n @ 'A'..='Z' | n @ '_') = iter.next() {
                        self.l += n.len_utf8();
                    }

                    // Candidate value for the glue case: integer part plus
                    // fractional digits, negated if needed.
                    let mut s = common::Scaled::from_decimal_digits(&d) + common::Scaled::ONE * n;
                    if negative {
                        s.0 *= -1;
                    }
                    let raw_unit = &self.s[u..self.l];
                    if let Some(unit) = common::ScaledUnit::parse(raw_unit) {
                        // NOTE(review): `unwrap` panics if the unit-scaled
                        // value overflows — confirm inputs are bounded.
                        let mut s =
                            common::Scaled::new(n, common::Scaled::from_decimal_digits(&d), unit)
                                .unwrap();
                        if negative {
                            s = -s;
                        }
                        return TokenValue::Scaled(s);
                    }
                    if let Some(glue_order) = common::GlueOrder::parse(raw_unit) {
                        return TokenValue::InfiniteGlue(s, glue_order);
                    }
                    // Unknown unit: report and yield a zero dimension.
                    self.errs.add(Error::InvalidDimensionUnit {
                        dimension: Str {
                            value: self.s,
                            start: start_idx,
                            end: self.l,
                        },
                        unit: Str {
                            value: self.s,
                            start: u,
                            end: self.l,
                        },
                    });
                    return TokenValue::Scaled(common::Scaled::ZERO);
                }
                d => {
                    // End of the literal: a non-numeric character or end of
                    // input. (The terminating character, if any, is re-lexed
                    // on the next `next()` call since `self.l` was not
                    // advanced past it.)
                    if !parsing_n {
                        // A decimal point was seen, so a unit was required.
                        self.errs.add(Error::NumberWithoutUnits {
                            number: Str {
                                value: self.s,
                                start: self.l,
                                end: self.l + d.map(|c| c.len_utf8()).unwrap_or(0),
                            },
                        });
                        return TokenValue::Scaled(common::Scaled::ZERO);
                    }
                    if negative {
                        n *= -1;
                    }
                    return TokenValue::Integer(n);
                }
            }
        }
    }
}
455
#[cfg(test)]
mod tests {
    use crate::ErrorAccumulator;

    use super::*;

    /// Lexes `input` and asserts the produced token values equal `want`.
    fn run_lexer_test(input: &str, want: Vec<TokenValue>) {
        let errs: ErrorAccumulator = Default::default();
        // Fix: `input` is already `&str`; `&input` passed a needless `&&str`
        // that only compiled via deref coercion.
        let lexer = Lexer::new(input, errs);

        // Fix: `Lexer` is itself an `Iterator`, so `into_iter()` was a no-op.
        let got: Vec<TokenValue> = lexer.map(|t| t.value).collect();

        assert_eq!(got, want);
    }

    /// Declares one `#[test]` per `(name, input, expected-tokens)` triple.
    macro_rules! lexer_tests {
        ( $( ($name: ident, $input: expr, $want: expr, ), )+ ) => {
            $(
                #[test]
                fn $name() {
                    let input = $input;
                    let want = $want;
                    run_lexer_test(input, want);
                }
            )+
        };
    }

    lexer_tests!(
        (
            string_simple,
            r#" "string" "#,
            vec![TokenValue::String("string".into())],
        ),
        (
            string_with_special_char_1,
            r#" "\"" "#,
            vec![TokenValue::String("\"".into())],
        ),
        (
            string_with_special_char_2,
            r#" "\\" "#,
            vec![TokenValue::String("\\".into())],
        ),
        (
            string_with_invalid_special_char,
            r#" "\a" "#,
            vec![TokenValue::String("".into())],
        ),
        (
            string_with_unicode,
            r#" "\u{100}", "second" "#,
            vec![
                TokenValue::String("\u{100}".into()),
                TokenValue::Comma,
                TokenValue::String("second".into()),
            ],
        ),
    );
}