texlang/parse/
integer.rs

1//! Number parsing.
2//!
3//! The number may be octal, decimal, hexadecimal, cast from a character token, or read
//! from an internal register. The full definition of a number in the TeX grammar
5//! is given on page X of the TeXBook.
6
7use crate::prelude as txl;
8use crate::token::{CommandRef, Value};
9use crate::traits::*;
10use crate::*;
11
12impl Parsable for i32 {
13    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
14        let (_, i, _) = parse_integer(input)?;
15        Ok(i)
16    }
17}
18
19impl Parsable for u8 {
20    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
21        let u = Uint::<256>::parse_impl(input)?;
22        Ok(u.0.try_into().expect("smaller than 256 so in range"))
23    }
24}
/// A nonnegative integer which, when parsed, is required to be strictly less than `N`.
///
/// Parsing this type provides the logic of the following TeX routines:
///
/// - TeX.2021.433 (scan_eight_bit_int) where N=256.
/// - TeX.2021.434 (scan_char_num) where N=256.
/// - TeX.2021.435 (scan_four_bit_int) where N=16.
/// - TeX.2021.436 (scan_fifteen_bit_int) where N=2^15.
/// - TeX.2021.437 (scan_twenty_seven_bit_int) where N=2^27.
#[derive(Debug, Default, PartialEq, Eq)]
pub struct Uint<const N: usize>(pub usize);

impl Uint<0> {
    /// The value of [`i32::MAX`] expressed as a `usize`.
    pub const MAX: usize = i32::MAX as usize;
}
40
41impl<const N: usize> Parsable for Uint<N> {
42    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
43        let (first_token, i, _) = parse_integer(input)?;
44        if i < 0 || i as usize >= N {
45            input.error(OutOfBoundsError::<N> {
46                first_token,
47                got: i,
48            })?;
49            Ok(Uint(0))
50        } else {
51            Ok(Uint(i as usize))
52        }
53    }
54}
55
56#[derive(Debug)]
57struct OutOfBoundsError<const N: usize> {
58    first_token: token::Token,
59    got: i32,
60}
61
62impl<const N: usize> error::TexError for OutOfBoundsError<N> {
63    fn kind(&self) -> error::Kind {
64        error::Kind::Token(self.first_token)
65    }
66
67    fn title(&self) -> String {
68        format!(
69            "expected an integer in the range [0, {}), got {}",
70            N, self.got
71        )
72    }
73}
74
75impl Parsable for char {
76    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
77        let u1 = Uint::<{ char::MAX as usize }>::parse(input)?;
78        let u2: u32 = u1.0.try_into().unwrap();
79        Ok(char::from_u32(u2).unwrap())
80    }
81}
82
83// TODO: move to types/catcode.rs
84impl Parsable for types::CatCode {
85    fn parse_impl<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<Self> {
86        let (token, i, _) = parse_integer(input)?;
87        if let Ok(val_u8) = u8::try_from(i) {
88            if let Ok(cat_code) = types::CatCode::try_from(val_u8) {
89                return Ok(cat_code);
90            }
91        }
92        input.error(parse::Error {
93            expected: "a category code number (an integer in the range [0, 15])".into(),
94            got: Some(token),
95            got_override: format!["got the integer {i}"],
96            annotation_override: "this is where the number started".into(),
97            guidance: "".into(),
98            additional_notes: vec![],
99        })?;
100        Ok(types::CatCode::try_from(0).unwrap())
101    }
102}
103
/// Guidance text attached to errors raised when the start of a number was expected.
///
/// The text consists of an introductory line followed by one bullet per accepted
/// form, and ends with a newline.
const GUIDANCE_BEGINNING: &str = concat!(
    "a number begins with zero or more minus signs followed by one of the following:\n",
    "- A decimal digit (0-9), which begins a decimal number.\n",
    "- The character ', which indicates the beginning of an octal number\n",
    "- The character \", which indicates the beginning of a hexadecimal number\n",
    "- The character `, followed by a character token. The character is converted into its UTF-8 number.\n",
    "- A command that references a variable, like \\year.\n",
);
112
113/// TeX.2021.440 (scan_int)
114pub(crate) fn parse_integer<S: TexlangState>(
115    stream: &mut vm::ExpandedStream<S>,
116) -> txl::Result<(token::Token, i32, Option<u8>)> {
117    let sign = parse_optional_signs(stream)?;
118    let first_token = stream.next_or_err(NumberEndOfInputError {})?;
119    let (result, radix) = match first_token.value() {
120        Value::Other('0') => (parse_constant::<S, 10>(stream, 0)?, Some(10_u8)),
121        Value::Other('1') => (parse_constant::<S, 10>(stream, 1)?, Some(10_u8)),
122        Value::Other('2') => (parse_constant::<S, 10>(stream, 2)?, Some(10_u8)),
123        Value::Other('3') => (parse_constant::<S, 10>(stream, 3)?, Some(10_u8)),
124        Value::Other('4') => (parse_constant::<S, 10>(stream, 4)?, Some(10_u8)),
125        Value::Other('5') => (parse_constant::<S, 10>(stream, 5)?, Some(10_u8)),
126        Value::Other('6') => (parse_constant::<S, 10>(stream, 6)?, Some(10_u8)),
127        Value::Other('7') => (parse_constant::<S, 10>(stream, 7)?, Some(10_u8)),
128        Value::Other('8') => (parse_constant::<S, 10>(stream, 8)?, Some(10_u8)),
129        Value::Other('9') => (parse_constant::<S, 10>(stream, 9)?, Some(10_u8)),
130        Value::Other('\'') => (parse_constant::<S, 8>(stream, 0)?, Some(8_u8)),
131        Value::Other('"') => (parse_constant::<S, 16>(stream, 0)?, Some(16_u8)),
132        Value::Other('`') => (parse_character(stream)?, None),
133        Value::CommandRef(command_ref) => (
134            parse_internal_number(stream, first_token, command_ref)?.integer(),
135            None,
136        ),
137        // TeX.2021.446
138        _ => {
139            stream.back(first_token);
140            stream.error(parse::Error::new(
141                "the beginning of a number",
142                Some(first_token),
143                GUIDANCE_BEGINNING,
144            ))?;
145            (0, None)
146        }
147    };
148    let result = match sign {
149        None => result,
150        // The only i32 that is not safe to multiply by -1 is i32::MIN.
151        // Experimentally we observe in this case that TeX wraps and the result
152        // is i32::MIN again.
153        Some(_) => result.wrapping_mul(-1),
154    };
155    Ok((first_token, result, radix))
156}
157
158#[derive(Debug)]
159pub(crate) enum InternalNumber {
160    Integer(i32),
161    Dimen(common::Scaled),
162    Glue(common::Glue),
163}
164
165impl InternalNumber {
166    pub(crate) fn integer(&self) -> i32 {
167        use InternalNumber::*;
168        match self {
169            Integer(i) => *i,
170            Dimen(scaled) => scaled.0,
171            Glue(glue) => glue.width.0,
172        }
173    }
174}
175
176/// This function reimplements TeX.2021.413 (scan_something_internal) under the following
177/// conditions:
178///
179/// - level is int_val, dimen_val, glue_val or mu_val; i.e., the call to this function
180///   is looking for a number, not a token list or font. The token list or font cases
181///   are handled elsewhere.
182///
183/// - negative is false. The negative=true case is handled by the caller to this function.
184///
185/// - The logic around casting between types (i.e. TeX.2021.429) is omitted. Instead
186///   callers of this function perform the casting. The motivation is to make the code
187///   more explicit and avoid parameters that change logic and/or return types.
188pub(crate) fn parse_internal_number<S: TexlangState>(
189    input: &mut vm::ExpandedStream<S>,
190    first_token: token::Token,
191    command_ref: CommandRef,
192) -> txl::Result<InternalNumber> {
193    let cmd = input.commands_map().get_command(&command_ref);
194    match cmd {
195        Some(command::Command::Variable(cmd)) => {
196            match cmd.clone().value(first_token, input)? {
197                variable::ValueRef::Int(i) => Ok(InternalNumber::Integer(*i)),
198                variable::ValueRef::SmallInt(c) => Ok(InternalNumber::Integer(*c as i32)),
199                variable::ValueRef::CatCode(c) => Ok(InternalNumber::Integer(*c as i32)),
200                variable::ValueRef::MathCode(c) => Ok(InternalNumber::Integer(c.0 as i32)),
201                variable::ValueRef::Dimen(d) => Ok(InternalNumber::Dimen(*d)),
202                variable::ValueRef::Glue(g) => Ok(InternalNumber::Glue(*g)),
203                variable::ValueRef::Font(_) => {
204                    // This case behaves identically to the TokenListCase
205                    todo!("scan a font into an int?");
206                }
207                variable::ValueRef::TokenList(_) => Err(input.fatal_error(
208                    parse::Error::new(
209                        "the beginning of a number",
210                        Some(first_token),
211                        GUIDANCE_BEGINNING,
212                    )
213                    .with_annotation_override("token list variable"),
214                )),
215            }
216        }
217        Some(command::Command::Character(c)) => Ok(InternalNumber::Integer(*c as i32)),
218        Some(command::Command::MathCharacter(c)) => Ok(InternalNumber::Integer(c.0 as i32)),
219        None
220        | Some(
221            command::Command::Execution(..)
222            | command::Command::Expansion(..)
223            | command::Command::Macro(..)
224            | command::Command::CharacterTokenAlias(..)
225            | command::Command::Font(..),
226        ) => {
227            let err = parse::Error::new(
228                "the beginning of a number",
229                Some(first_token),
230                GUIDANCE_BEGINNING,
231            )
232            .with_annotation_override(match cmd {
233                None => "undefined control sequence".to_string(),
234                Some(cmd) => format!["control sequence referencing {cmd}"],
235            });
236            input.expansions_mut().push(first_token);
237            Err(input.fatal_error(err))
238        }
239    }
240}
241
242#[derive(Debug)]
243struct NumberEndOfInputError;
244
245impl error::EndOfInputError for NumberEndOfInputError {
246    fn doing(&self) -> String {
247        "parsing a number".into()
248    }
249    fn notes(&self) -> Vec<error::display::Note> {
250        vec![GUIDANCE_BEGINNING.into()]
251    }
252}
253
254/// Parses optional signs and spaces.
255///
256/// If the combination of the signs is positive, [None] is returned.
257/// Otherwise, the Token corresponding to the last negative sign is returned.
258///
259/// This is TeX.2021.441.
260pub fn parse_optional_signs<S: TexlangState>(
261    stream: &mut vm::ExpandedStream<S>,
262) -> txl::Result<Option<token::Token>> {
263    let mut result = None;
264    while let Some((sign, token)) = get_optional_element_with_token![
265        stream,
266        Value::Other('+') => true,
267        Value::Other('-') => false,
268        Value::Space(_) => true,
269    ] {
270        result = match (result, sign) {
271            (None, false) => Some(token),
272            (Some(_), false) => None,
273            (result, true) => result,
274        };
275    }
276    Ok(result)
277}
278
279// TeX.2021.442
280fn parse_character<S: TexlangState>(input: &mut vm::ExpandedStream<S>) -> txl::Result<i32> {
281    // BUG: should be from the unexpanded stream
282    let c = {
283        let token = input.next_or_err(CharacterError {})?;
284        match token.value() {
285            Value::CommandRef(token::CommandRef::ControlSequence(cs_name)) => {
286                let name = input.vm().cs_name_interner().resolve(cs_name).unwrap();
287                let mut iter = name.chars();
288                match (iter.next(), iter.count()) {
289                    // (None, 0) => ?! TODO: add a test for this.
290                    // Should be something like:
291                    // \expandafter \i \expandafter ` \csname\endcsname
292                    (Some(c), 0) => c,
293                    _ => {
294                        input.error(parse::Error::new(
295                            "a character",
296                            Some(token),
297                            "a character is a character token or single-character control sequence like \\a",
298                        ))?;
299                        '0'
300                    }
301                }
302            }
303            _ => token.char().unwrap(),
304        }
305    };
306    super::OptionalSpace::parse(input)?;
307    Ok(c as i32)
308}
309
310#[derive(Debug)]
311struct CharacterError;
312
313impl error::EndOfInputError for CharacterError {
314    fn doing(&self) -> String {
315        "parsing a character".into()
316    }
317
318    fn notes(&self) -> Vec<error::display::Note> {
319        vec![
320            r"a character is a character token or single-character control sequence like \a".into(),
321        ]
322    }
323}
324
325/// TeX.2021.444-445
326// TODO: why is the radix a const parameter?
327fn parse_constant<S: TexlangState, const RADIX: i32>(
328    stream: &mut vm::ExpandedStream<S>,
329    mut result: i32,
330) -> txl::Result<i32> {
331    let mut started = RADIX == 10;
332    let mut too_big = false;
333    loop {
334        let next = match stream.next()? {
335            None => break,
336            Some(next) => next,
337        };
338        let lsd_or = match next.value() {
339            token::Value::Other(c) => {
340                let d = (c as u32).wrapping_sub('0' as u32);
341                if d < 10 && d < (RADIX as u32) {
342                    Some(d as i32)
343                } else if RADIX == 16 {
344                    let d = (c as u32).wrapping_sub('A' as u32);
345                    if d < 6 {
346                        Some(d as i32 + 10)
347                    } else {
348                        None
349                    }
350                } else {
351                    None
352                }
353            }
354            token::Value::Letter(c) => {
355                let d = (c as u32).wrapping_sub('A' as u32);
356                if RADIX == 16 && d < 6 {
357                    Some(d as i32 + 10)
358                } else {
359                    None
360                }
361            }
362            _ => None,
363        };
364        let lsd = match lsd_or {
365            None => {
366                stream.back(next);
367                break;
368            }
369            Some(lsd) => lsd,
370        };
371        started = true;
372        result = match add_lsd::<RADIX>(result, lsd) {
373            Some(n) => n,
374            None => {
375                if !too_big {
376                    stream.error(add_lsd_error::<RADIX>(next, result, lsd))?;
377                    too_big = true;
378                }
379                i32::MAX
380            }
381        }
382    }
383    if !started {
384        let (expected, guidance) = match RADIX {
385            8 => {
386                ("an octal digit",
387                "an octal digit is a token with value 0-7 and category other")
388            },
389            16 => {
390                ("a hexadecimal digit",
391                "a hexadecimal digit is either:\n- A character token with value 0-9 and category other, or\n- A character token with value A-F and category letter or other")
392            }
393            _ => unreachable!(),
394        };
395        let got = stream.peek()?;
396        stream.error(parse::Error::new(expected, got, guidance))?;
397    }
398    super::OptionalSpace::parse(stream)?;
399    Ok(result)
400}
401
/// Appends the least significant digit `lsd` to `n` in base `RADIX`.
///
/// Returns `n * RADIX + lsd`, or `None` if either step overflows an `i32`.
fn add_lsd<const RADIX: i32>(n: i32, lsd: i32) -> Option<i32> {
    n.checked_mul(RADIX)
        .and_then(|shifted| shifted.checked_add(lsd))
}
408
409fn add_lsd_error<const RADIX: i32>(token: token::Token, n: i32, lsd: i32) -> parse::Error {
410    let (got, range) = match RADIX {
411        8 => (
412            format!["got '{n:o}{lsd:o}"],
413            format!["'{:o}, '{:o}", i32::MIN, i32::MAX],
414        ),
415        10 => (
416            format!["got {n}{lsd}"],
417            format!["{}, {}", i32::MIN, i32::MAX],
418        ),
419        16 => (
420            format!["got 0x{n:X}{lsd:X}"],
421            format!["0x{:X}, 0x{:X}", i32::MIN, i32::MAX],
422        ),
423        _ => panic!("radix must be 8, 10 or 16"),
424    };
425    parse::Error {
426        expected: format!["a number in the range [{range}]"],
427        got: Some(token),
428        got_override: got,
429        annotation_override: "this digit makes the number too big".into(),
430        guidance: "".into(),
431        additional_notes: vec![],
432    }
433}
434
/// Tests for integer parsing.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parse::testing::*;

    // Each case is (test name, input, expected value).
    parse_success_tests![
        (octal_0, "'0", 0),
        (octal_1, "'1", 1),
        (octal_2, "'2", 2),
        (octal_3, "'3", 3),
        (octal_4, "'4", 4),
        (octal_5, "'5", 5),
        (octal_6, "'6", 6),
        (octal_7, "'7", 7),
        (octal_8, "'10", 8),
        (octal_9, "'11", 9),
        (octal_10, "'12", 10),
        (octal_11, "'13", 11),
        (octal_12, "'14", 12),
        (octal_13, "'15", 13),
        (octal_14, "'16", 14),
        (octal_15, "'17", 15),
        (octal_129, "'201", 129),
        (octal_max, "'17777777777", 2147483647),
        (octal_min, "-'17777777777", -2147483647),
        (decimal_0, "0", 0),
        (decimal_1, "1", 1),
        (decimal_2, "2", 2),
        (decimal_3, "3", 3),
        (decimal_4, "4", 4),
        (decimal_5, "5", 5),
        (decimal_6, "6", 6),
        (decimal_7, "7", 7),
        (decimal_8, "8", 8),
        (decimal_9, "9", 9),
        (decimal_10, "10", 10),
        (decimal_11, "11", 11),
        (decimal_12, "12", 12),
        (decimal_13, "13", 13),
        (decimal_14, "14", 14),
        (decimal_15, "15", 15),
        (decimal_16, "16", 16),
        (decimal_17, "17", 17),
        (decimal_18, "18", 18),
        (decimal_19, "19", 19),
        (decimal_19_with_0_padding, "00019", 19),
        (decimal_201, "201", 201),
        (decimal_max, "2147483647", 2147483647),
        (decimal_min, "-2147483647", -2147483647),
        (hexadecimal_0, "\"0", 0),
        (hexadecimal_1, "\"1", 1),
        (hexadecimal_2, "\"2", 2),
        (hexadecimal_3, "\"3", 3),
        (hexadecimal_4, "\"4", 4),
        (hexadecimal_5, "\"5", 5),
        (hexadecimal_6, "\"6", 6),
        (hexadecimal_7, "\"7", 7),
        (hexadecimal_8, "\"8", 8),
        (hexadecimal_9, "\"9", 9),
        (hexadecimal_10, "\"A", 10),
        (hexadecimal_11, "\"B", 11),
        (hexadecimal_12, "\"C", 12),
        (hexadecimal_13, "\"D", 13),
        (hexadecimal_14, "\"E", 14),
        (hexadecimal_15, "\"F", 15),
        (hexadecimal_16, "\"10", 16),
        (hexadecimal_17, "\"11", 17),
        (hexadecimal_18, "\"12", 18),
        (hexadecimal_19, "\"13", 19),
        (hexadecimal_20, "\"14", 20),
        (hexadecimal_21, "\"15", 21),
        (hexadecimal_22, "\"16", 22),
        (hexadecimal_23, "\"17", 23),
        (hexadecimal_24, "\"18", 24),
        (hexadecimal_25, "\"19", 25),
        (hexadecimal_26, "\"1A", 26),
        (hexadecimal_27, "\"1B", 27),
        (hexadecimal_28, "\"1C", 28),
        (hexadecimal_29, "\"1D", 29),
        (hexadecimal_30, "\"1E", 30),
        (hexadecimal_31, "\"1F", 31),
        (hexadecimal_513, "\"201", 513),
        (hexadecimal_max, "\"7FFFFFFF", 2147483647),
        (hexadecimal_min, "-\"7FFFFFFF", -2147483647),
        (number_from_character, "`A", 65),
        (number_from_length_1_control_sequence, r"`\A", 65),
        (number_from_character_non_ascii, "`ö", 0x00F6),
        (
            number_from_length_1_control_sequence_non_ascii,
            r"`\ö",
            0x00F6
        ),
        (signs_plus, r"+4", 4),
        (signs_minus, r"-4", -4),
        (signs_plus_minus, r"+-4", -4),
        (signs_minus_minus, r"--4", 4),
        (signs_minus_minus_spaces, r"  -  - 4", 4),
    ];

    /// State used by the failure tests below.
    #[derive(Default)]
    struct State;

    impl TexlangState for State {
        /// Gives the character `9` the category letter, so that it cannot be used as
        /// a digit; every other character gets its plain TeX default category.
        fn cat_code(&self, c: char) -> types::CatCode {
            if c == '9' {
                return types::CatCode::Letter;
            }
            types::CatCode::PLAIN_TEX_DEFAULTS
                .get(c as usize)
                .copied()
                .unwrap_or_default()
        }
    }

    parse_failure_tests![
        i32,
        State,
        (number_with_letter_catcode, "9"),
        (octal_too_big, "'177777777770", i32::MAX),
        (octal_empty, "'"),
        (decimal_too_big_1, "2147483648", i32::MAX),
        (decimal_too_big_2, "500000000000000", i32::MAX),
        (decimal_too_negative_1, "-2147483648", -i32::MAX),
        (decimal_too_negative_2, "-5000000000000", -i32::MAX),
        (hexadecimal_too_big, "\"7FFFFFFF0", i32::MAX),
        (hexadecimal_empty, "\""),
        (character, "A"),
        // TODO: the test is messed up because a space gets appended to the input
        // (character_missing, r"`", '0' as i32),
        (control_sequence_too_big, r"`\BC", '0' as i32),
    ];

    parse_failure_tests![
        Uint::<16>,
        State,
        (number_too_big, "16"),
        (number_is_negative, "-1"),
    ];
}