texlang/parse/
integer.rs

1//! Number parsing.
2//!
//! The number may be octal, decimal, hexadecimal, cast from a character token, or read
//! from an internal register. The full definition of a number in the TeX grammar
//! is given in Chapter 24 of The TeXbook.
6
7use crate::prelude as txl;
8use crate::token::{CommandRef, Value};
9use crate::traits::*;
10use crate::*;
11
12impl Parsable for i32 {
13    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
14        let (_, i, _) = parse_integer(input)?;
15        Ok(i)
16    }
17}
18
/// When parsed, this type returns a nonnegative integer with the provided upper bound.
///
/// The bound `N` is exclusive: the accepted values are those in the half-open range
/// `[0, N)`. Parsing a negative integer, or one that is `N` or larger, reports an
/// out-of-bounds error.
///
/// This type is used to implement the following parsing logic in TeX:
///
/// - TeX.2021.433 (scan_eight_bit_int) where N=256.
/// - TeX.2021.434 (scan_char_num) where N=256.
/// - TeX.2021.435 (scan_four_bit_int) where N=16.
/// - TeX.2021.436 (scan_fifteen_bit_int) where N=2^15.
/// - TeX.2021.437 (scan_twenty_seven_bit_int) where N=2^27.
#[derive(Debug, PartialEq, Eq, Default)]
pub struct Uint<const N: usize>(pub usize);
30
impl Uint<0> {
    /// The value of `i32::MAX` converted to `usize`.
    ///
    /// NOTE(review): presumably intended to be supplied as the bound `N` when the
    /// widest range is wanted — confirm against callers.
    pub const MAX: usize = i32::MAX as usize;
}
34
35impl<const N: usize> Parsable for Uint<N> {
36    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
37        let (first_token, i, _) = parse_integer(input)?;
38        if i < 0 || i as usize >= N {
39            input.error(OutOfBoundsError::<N> {
40                first_token,
41                got: i,
42            })?;
43            Ok(Uint(0))
44        } else {
45            Ok(Uint(i as usize))
46        }
47    }
48}
49
/// Error reported when a parsed integer falls outside the range `[0, N)`.
#[derive(Debug)]
struct OutOfBoundsError<const N: usize> {
    /// First token of the number; the error is attached to this token.
    first_token: token::Token,
    /// The integer that was actually parsed.
    got: i32,
}
55
56impl<const N: usize> error::TexError for OutOfBoundsError<N> {
57    fn kind(&self) -> error::Kind {
58        error::Kind::Token(self.first_token)
59    }
60
61    fn title(&self) -> String {
62        format!(
63            "expected an integer in the range [0, {}), got {}",
64            N, self.got
65        )
66    }
67}
68
69impl Parsable for char {
70    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
71        let u1 = Uint::<{ char::MAX as usize }>::parse(input)?;
72        let u2: u32 = u1.0.try_into().unwrap();
73        Ok(char::from_u32(u2).unwrap())
74    }
75}
76
77// TODO: move to types/catcode.rs
78impl Parsable for types::CatCode {
79    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
80        let (token, i, _) = parse_integer(input)?;
81        if let Ok(val_u8) = u8::try_from(i) {
82            if let Ok(cat_code) = types::CatCode::try_from(val_u8) {
83                return Ok(cat_code);
84            }
85        }
86        input.error(parse::Error {
87            expected: "a category code number (an integer in the range [0, 15])".into(),
88            got: Some(token),
89            got_override: format!["got the integer {i}"],
90            annotation_override: "this is where the number started".into(),
91            guidance: "".into(),
92            additional_notes: vec![],
93        })?;
94        Ok(types::CatCode::try_from(0).unwrap())
95    }
96}
97
/// Guidance text describing the tokens that may begin a number.
///
/// Shared by the errors in this module that are raised when a number was expected.
/// The backtick case yields the character's Unicode code point (the character is
/// cast directly to an integer), not a UTF-8 encoding.
const GUIDANCE_BEGINNING: &str =
    "a number begins with zero or more minus signs followed by one of the following:
- A decimal digit (0-9), which begins a decimal number.
- The character ', which indicates the beginning of an octal number
- The character \", which indicates the beginning of a hexadecimal number
- The character `, followed by a character token. The character is converted into its Unicode code point.
- A command that references a variable, like \\year.
";
106
107/// TeX.2021.440 (scan_int)
108pub(crate) fn parse_integer<S: TexlangState>(
109    stream: &mut vm::ExpandedStream<S>,
110) -> txl::Result<(token::Token, i32, Option<u8>)> {
111    let sign = parse_optional_signs(stream)?;
112    let first_token = stream.next_or_err(NumberEndOfInputError {})?;
113    let (result, radix) = match first_token.value() {
114        Value::Other('0') => (parse_constant::<S, 10>(stream, 0)?, Some(10_u8)),
115        Value::Other('1') => (parse_constant::<S, 10>(stream, 1)?, Some(10_u8)),
116        Value::Other('2') => (parse_constant::<S, 10>(stream, 2)?, Some(10_u8)),
117        Value::Other('3') => (parse_constant::<S, 10>(stream, 3)?, Some(10_u8)),
118        Value::Other('4') => (parse_constant::<S, 10>(stream, 4)?, Some(10_u8)),
119        Value::Other('5') => (parse_constant::<S, 10>(stream, 5)?, Some(10_u8)),
120        Value::Other('6') => (parse_constant::<S, 10>(stream, 6)?, Some(10_u8)),
121        Value::Other('7') => (parse_constant::<S, 10>(stream, 7)?, Some(10_u8)),
122        Value::Other('8') => (parse_constant::<S, 10>(stream, 8)?, Some(10_u8)),
123        Value::Other('9') => (parse_constant::<S, 10>(stream, 9)?, Some(10_u8)),
124        Value::Other('\'') => (parse_constant::<S, 8>(stream, 0)?, Some(8_u8)),
125        Value::Other('"') => (parse_constant::<S, 16>(stream, 0)?, Some(16_u8)),
126        Value::Other('`') => (parse_character(stream)?, None),
127        Value::CommandRef(command_ref) => (
128            parse_internal_number(stream, first_token, command_ref)?.integer(),
129            None,
130        ),
131        // TeX.2021.446
132        _ => {
133            stream.back(first_token);
134            stream.error(parse::Error::new(
135                "the beginning of a number",
136                Some(first_token),
137                GUIDANCE_BEGINNING,
138            ))?;
139            (0, None)
140        }
141    };
142    let result = match sign {
143        None => result,
144        // The only i32 that is not safe to multiply by -1 is i32::MIN.
145        // Experimentally we observe in this case that TeX wraps and the result
146        // is i32::MIN again.
147        Some(_) => result.wrapping_mul(-1),
148    };
149    Ok((first_token, result, radix))
150}
151
/// A numeric value obtained from a command, tagged with the kind of quantity it holds.
///
/// The tag is kept so that callers can decide how to cast the value.
#[derive(Debug)]
pub(crate) enum InternalNumber {
    /// A plain integer.
    Integer(i32),
    /// A dimension.
    Dimen(core::Scaled),
    /// A glue value.
    Glue(core::Glue),
}
158
159impl InternalNumber {
160    pub(crate) fn integer(&self) -> i32 {
161        use InternalNumber::*;
162        match self {
163            Integer(i) => *i,
164            Dimen(scaled) => scaled.0,
165            Glue(glue) => glue.width.0,
166        }
167    }
168}
169
/// This function reimplements TeX.2021.413 (scan_something_internal) under the following
/// conditions:
///
/// - level is int_val, dimen_val, glue_val or mu_val; i.e., the call to this function
///   is looking for a number, not a token list or font. The token list or font cases
///   are handled elsewhere.
///
/// - negative is false. The negative=true case is handled by the caller to this function.
///
/// - The logic around casting between types (i.e. TeX.2021.429) is omitted. Instead
///   callers of this function perform the casting. The motivation is to make the code
///   more explicit and avoid parameters that change logic and/or return types.
///
/// `first_token` is the token that produced `command_ref`; it is used to read variable
/// values and to locate errors.
///
/// # Errors
///
/// A fatal error is returned if the command is undefined, is a token list variable, or
/// is a kind of command that does not hold a number (execution, expansion, macro,
/// character token alias or font commands).
///
/// # Panics
///
/// Panics via `todo!` if the command is a variable holding a font.
pub(crate) fn parse_internal_number<S: TexlangState>(
    input: &mut vm::ExpandedStream<S>,
    first_token: token::Token,
    command_ref: CommandRef,
) -> txl::Result<InternalNumber> {
    let cmd = input.commands_map().get_command(&command_ref);
    match cmd {
        Some(command::Command::Variable(cmd)) => {
            // NOTE(review): the clone presumably releases the borrow of `input` held
            // through `cmd` so that `input` can be passed to `value` — confirm.
            match cmd.clone().value(first_token, input)? {
                variable::ValueRef::Int(i) => Ok(InternalNumber::Integer(*i)),
                variable::ValueRef::CatCode(c) => Ok(InternalNumber::Integer(*c as i32)),
                variable::ValueRef::MathCode(c) => Ok(InternalNumber::Integer(c.0 as i32)),
                variable::ValueRef::Dimen(d) => Ok(InternalNumber::Dimen(*d)),
                variable::ValueRef::Glue(g) => Ok(InternalNumber::Glue(*g)),
                variable::ValueRef::Font(_) => {
                    // This case behaves identically to the TokenListCase
                    todo!("scan a font into an int?");
                }
                // A token list cannot be interpreted as a number.
                variable::ValueRef::TokenList(_) => Err(input.fatal_error(
                    parse::Error::new(
                        "the beginning of a number",
                        Some(first_token),
                        GUIDANCE_BEGINNING,
                    )
                    .with_annotation_override("token list variable"),
                )),
            }
        }
        // Character and math character commands are read as their integer codes.
        Some(command::Command::Character(c)) => Ok(InternalNumber::Integer(*c as i32)),
        Some(command::Command::MathCharacter(c)) => Ok(InternalNumber::Integer(c.0 as i32)),
        // Everything else (including an undefined command) cannot begin a number.
        None
        | Some(
            command::Command::Execution(..)
            | command::Command::Expansion(..)
            | command::Command::Macro(..)
            | command::Command::CharacterTokenAlias(..)
            | command::Command::Font(..),
        ) => {
            // The error is built first, while `cmd` is still available for the annotation.
            let err = parse::Error::new(
                "the beginning of a number",
                Some(first_token),
                GUIDANCE_BEGINNING,
            )
            .with_annotation_override(match cmd {
                None => "undefined control sequence".to_string(),
                Some(cmd) => format!["control sequence referencing {cmd}"],
            });
            // NOTE(review): the token is pushed onto the expansions stack before the
            // fatal error is raised, presumably so that it is not lost — confirm.
            input.expansions_mut().push(first_token);
            Err(input.fatal_error(err))
        }
    }
}
234
235#[derive(Debug)]
236struct NumberEndOfInputError;
237
238impl error::EndOfInputError for NumberEndOfInputError {
239    fn doing(&self) -> String {
240        "parsing a number".into()
241    }
242    fn notes(&self) -> Vec<error::display::Note> {
243        vec![GUIDANCE_BEGINNING.into()]
244    }
245}
246
247/// Parses optional signs and spaces.
248///
249/// If the combination of the signs is positive, [None] is returned.
250/// Otherwise, the Token corresponding to the last negative sign is returned.
251///
252/// This is TeX.2021.441.
253pub fn parse_optional_signs<S: TexlangState>(
254    stream: &mut vm::ExpandedStream<S>,
255) -> txl::Result<Option<token::Token>> {
256    let mut result = None;
257    while let Some((sign, token)) = get_optional_element_with_token![
258        stream,
259        Value::Other('+') => true,
260        Value::Other('-') => false,
261        Value::Space(_) => true,
262    ] {
263        result = match (result, sign) {
264            (None, false) => Some(token),
265            (Some(_), false) => None,
266            (result, true) => result,
267        };
268    }
269    Ok(result)
270}
271
272// TeX.2021.442
273fn parse_character<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<i32> {
274    // BUG: should be from the unexpanded stream
275    let c = {
276        let token = input.next_or_err(CharacterError {})?;
277        match token.value() {
278            Value::CommandRef(token::CommandRef::ControlSequence(cs_name)) => {
279                let name = input.vm().cs_name_interner().resolve(cs_name).unwrap();
280                let mut iter = name.chars();
281                match (iter.next(), iter.count()) {
282                    // (None, 0) => ?! TODO: add a test for this.
283                    // Should be something like:
284                    // \expandafter \i \expandafter ` \csname\endcsname
285                    (Some(c), 0) => c,
286                    _ => {
287                        input.error(parse::Error::new(
288                            "a character",
289                            Some(token),
290                            "a character is a character token or single-character control sequence like \\a",
291                        ))?;
292                        '0'
293                    }
294                }
295            }
296            _ => token.char().unwrap(),
297        }
298    };
299    super::OptionalSpace::parse(input)?;
300    Ok(c as i32)
301}
302
303#[derive(Debug)]
304struct CharacterError;
305
306impl error::EndOfInputError for CharacterError {
307    fn doing(&self) -> String {
308        "parsing a character".into()
309    }
310
311    fn notes(&self) -> Vec<error::display::Note> {
312        vec![
313            r"a character is a character token or single-character control sequence like \a".into(),
314        ]
315    }
316}
317
318/// TeX.2021.444-445
319// TODO: why is the radix a const parameter?
320fn parse_constant<S: TexlangState, const RADIX: i32>(
321    stream: &mut vm::ExpandedStream<S>,
322    mut result: i32,
323) -> txl::Result<i32> {
324    let mut started = RADIX == 10;
325    let mut too_big = false;
326    loop {
327        let next = match stream.next()? {
328            None => break,
329            Some(next) => next,
330        };
331        let lsd_or = match next.value() {
332            token::Value::Other(c) => {
333                let d = (c as u32).wrapping_sub('0' as u32);
334                if d < 10 && d < (RADIX as u32) {
335                    Some(d as i32)
336                } else if RADIX == 16 {
337                    let d = (c as u32).wrapping_sub('A' as u32);
338                    if d < 6 {
339                        Some(d as i32 + 10)
340                    } else {
341                        None
342                    }
343                } else {
344                    None
345                }
346            }
347            token::Value::Letter(c) => {
348                let d = (c as u32).wrapping_sub('A' as u32);
349                if RADIX == 16 && d < 6 {
350                    Some(d as i32 + 10)
351                } else {
352                    None
353                }
354            }
355            _ => None,
356        };
357        let lsd = match lsd_or {
358            None => {
359                stream.back(next);
360                break;
361            }
362            Some(lsd) => lsd,
363        };
364        started = true;
365        result = match add_lsd::<RADIX>(result, lsd) {
366            Some(n) => n,
367            None => {
368                if !too_big {
369                    stream.error(add_lsd_error::<RADIX>(next, result, lsd))?;
370                    too_big = true;
371                }
372                i32::MAX
373            }
374        }
375    }
376    if !started {
377        let (expected, guidance) = match RADIX {
378            8 => {
379                ("an octal digit",
380                "an octal digit is a token with value 0-7 and category other")
381            },
382            16 => {
383                ("a hexadecimal digit",
384                "a hexadecimal digit is either:\n- A character token with value 0-9 and category other, or\n- A character token with value A-F and category letter or other")
385            }
386            _ => unreachable!(),
387        };
388        let got = stream.peek()?;
389        stream.error(parse::Error::new(expected, got, guidance))?;
390    }
391    super::OptionalSpace::parse(stream)?;
392    Ok(result)
393}
394
/// Appends the least significant digit `lsd` to `n` in base `RADIX`.
///
/// Returns `n * RADIX + lsd`, or [None] if either step overflows an `i32`.
fn add_lsd<const RADIX: i32>(n: i32, lsd: i32) -> Option<i32> {
    n.checked_mul(RADIX)
        .and_then(|shifted| shifted.checked_add(lsd))
}
401
402fn add_lsd_error<const RADIX: i32>(token: token::Token, n: i32, lsd: i32) -> parse::Error {
403    let (got, range) = match RADIX {
404        8 => (
405            format!["got '{n:o}{lsd:o}"],
406            format!["'{:o}, '{:o}", i32::MIN, i32::MAX],
407        ),
408        10 => (
409            format!["got {n}{lsd}"],
410            format!["{}, {}", i32::MIN, i32::MAX],
411        ),
412        16 => (
413            format!["got 0x{n:X}{lsd:X}"],
414            format!["0x{:X}, 0x{:X}", i32::MIN, i32::MAX],
415        ),
416        _ => panic!("radix must be 8, 10 or 16"),
417    };
418    parse::Error {
419        expected: format!["a number in the range [{range}]"],
420        got: Some(token),
421        got_override: got,
422        annotation_override: "this digit makes the number too big".into(),
423        guidance: "".into(),
424        additional_notes: vec![],
425    }
426}
427
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parse::testing::*;

    // Each case is (test name, input, expected parsed integer).
    parse_success_tests![
        (octal_0, "'0", 0),
        (octal_1, "'1", 1),
        (octal_2, "'2", 2),
        (octal_3, "'3", 3),
        (octal_4, "'4", 4),
        (octal_5, "'5", 5),
        (octal_6, "'6", 6),
        (octal_7, "'7", 7),
        (octal_8, "'10", 8),
        (octal_9, "'11", 9),
        (octal_10, "'12", 10),
        (octal_11, "'13", 11),
        (octal_12, "'14", 12),
        (octal_13, "'15", 13),
        (octal_14, "'16", 14),
        (octal_15, "'17", 15),
        (octal_129, "'201", 129),
        (octal_max, "'17777777777", 2147483647),
        (octal_min, "-'17777777777", -2147483647),
        (decimal_0, "0", 0),
        (decimal_1, "1", 1),
        (decimal_2, "2", 2),
        (decimal_3, "3", 3),
        (decimal_4, "4", 4),
        (decimal_5, "5", 5),
        (decimal_6, "6", 6),
        (decimal_7, "7", 7),
        (decimal_8, "8", 8),
        (decimal_9, "9", 9),
        (decimal_10, "10", 10),
        (decimal_11, "11", 11),
        (decimal_12, "12", 12),
        (decimal_13, "13", 13),
        (decimal_14, "14", 14),
        (decimal_15, "15", 15),
        (decimal_16, "16", 16),
        (decimal_17, "17", 17),
        (decimal_18, "18", 18),
        (decimal_19, "19", 19),
        (decimal_1_with_0_padding, "00019", 19),
        (decimal_201, "201", 201),
        (decimal_max, "2147483647", 2147483647),
        (decimal_min, "-2147483647", -2147483647),
        (hexadecimal_0, "\"0", 0),
        (hexadecimal_1, "\"1", 1),
        (hexadecimal_2, "\"2", 2),
        (hexadecimal_3, "\"3", 3),
        (hexadecimal_4, "\"4", 4),
        (hexadecimal_5, "\"5", 5),
        (hexadecimal_6, "\"6", 6),
        (hexadecimal_7, "\"7", 7),
        (hexadecimal_8, "\"8", 8),
        (hexadecimal_9, "\"9", 9),
        (hexadecimal_10, "\"A", 10),
        (hexadecimal_11, "\"B", 11),
        (hexadecimal_12, "\"C", 12),
        (hexadecimal_13, "\"D", 13),
        (hexadecimal_14, "\"E", 14),
        (hexadecimal_15, "\"F", 15),
        (hexadecimal_16, "\"10", 16),
        (hexadecimal_17, "\"11", 17),
        (hexadecimal_18, "\"12", 18),
        (hexadecimal_19, "\"13", 19),
        (hexadecimal_20, "\"14", 20),
        (hexadecimal_21, "\"15", 21),
        (hexadecimal_22, "\"16", 22),
        (hexadecimal_23, "\"17", 23),
        (hexadecimal_24, "\"18", 24),
        (hexadecimal_25, "\"19", 25),
        (hexadecimal_26, "\"1A", 26),
        (hexadecimal_27, "\"1B", 27),
        (hexadecimal_28, "\"1C", 28),
        (hexadecimal_29, "\"1D", 29),
        (hexadecimal_30, "\"1E", 30),
        (hexadecimal_31, "\"1F", 31),
        (hexadecimal_513, "\"201", 513),
        (hexadecimal_max, "\"7FFFFFFF", 2147483647),
        (hexadecimal_min, "-\"7FFFFFFF", -2147483647),
        (number_from_character, "`A", 65),
        (number_from_length_1_control_sequence, r"`\A", 65),
        (number_from_character_non_ascii, "`ö", 0x00F6),
        (
            number_from_length_1_control_sequence_non_ascii,
            r"`\ö",
            0x00F6
        ),
        (signs_plus, r"+4", 4),
        (signs_minus, r"-4", -4),
        (signs_plus_minus, r"+-4", -4),
        (signs_minus_minus, r"--4", 4),
        (signs_minus_minus_spaces, r"  -  - 4", 4),
    ];

    /// Test state in which the character `9` has category code letter, so that a
    /// lone `9` is not a valid start of a number. All other characters use the
    /// plain TeX defaults.
    #[derive(Default)]
    struct State;

    impl TexlangState for State {
        fn cat_code(&self, c: char) -> types::CatCode {
            if c == '9' {
                return types::CatCode::Letter;
            }
            types::CatCode::PLAIN_TEX_DEFAULTS
                .get(c as usize)
                .copied()
                .unwrap_or_default()
        }
    }

    // Each case is (test name, input), optionally followed by a third value.
    // NOTE(review): the third value is presumably the result expected when the
    // error is recovered from — confirm against the macro definition.
    parse_failure_tests![
        i32,
        State,
        (number_with_letter_catcode, "9"),
        (octal_too_big, "'177777777770", i32::MAX),
        (octal_empty, "'"),
        (decimal_too_big_1, "2147483648", i32::MAX),
        (decimal_too_big_2, "500000000000000", i32::MAX),
        (decimal_too_negative_1, "-2147483648", -i32::MAX),
        (decimal_too_negative_2, "-5000000000000", -i32::MAX),
        (hexadecimal_too_big, "\"7FFFFFFF0", i32::MAX),
        (hexadecimal_empty, "\""),
        (character, "A"),
        // TODO: the test is messed up because a space gets appended to the input
        // (character_missing, r"`", '0' as i32),
        (control_sequence_too_big, r"`\BC", '0' as i32),
    ];

    parse_failure_tests![
        Uint::<16>,
        State,
        (number_too_big, "16"),
        (number_is_negative, "-1"),
    ];
}