texlang/token/
mod.rs

1//! TeX tokens and category codes.
2
3pub mod lexer;
4pub mod trace;
5use crate::types::CatCode;
6use std::{fmt::Display, num};
7use texcraft_stdext::collections::interner;
8
/// Compact identifier type used to represent control sequence names in Texlang.
///
/// The representation is intentionally opaque so that it can be tuned for
/// performance without affecting downstream consumers. The wrapped integer
/// is never zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CsName(num::NonZeroU32);

impl CsName {
    /// Returns the wrapped non-zero integer, widened to `usize`.
    #[inline]
    pub fn to_usize(&self) -> usize {
        let CsName(raw) = *self;
        raw.get() as usize
    }

    /// Builds a `CsName` from a raw integer.
    ///
    /// Returns `None` when `u` is zero or does not fit in a `u32`.
    pub fn try_from_usize(u: usize) -> Option<CsName> {
        u32::try_from(u)
            .ok()
            .and_then(num::NonZeroU32::new)
            .map(CsName)
    }
}
31
/// String interner for control sequence names.
///
/// This is the generic [`interner::Interner`] instantiated with [`CsName`] as its key type.
pub type CsNameInterner = interner::Interner<CsName>;
34
35impl interner::Key for CsName {
36    fn try_from_usize(index: usize) -> Option<Self> {
37        num::NonZeroU32::try_from_usize(index).map(CsName)
38    }
39
40    fn into_usize(self) -> usize {
41        self.0.into_usize()
42    }
43}
44
/// The value of a token.
///
/// Every variant except [`Value::CommandRef`] carries the character of the token, and the
/// variant itself records the token's category code (see [`Value::new`] and
/// [`Value::cat_code`]).
#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum Value {
    /// Character with the begin-group category code.
    BeginGroup(char),
    /// Character with the end-group category code.
    EndGroup(char),
    /// Character with the math-shift category code.
    MathShift(char),
    /// Character with the alignment-tab category code.
    AlignmentTab(char),
    /// Character with the parameter category code.
    Parameter(char),
    /// Character with the superscript category code.
    Superscript(char),
    /// Character with the subscript category code.
    Subscript(char),
    /// Character with the space category code.
    Space(char),
    /// Character with the letter category code.
    Letter(char),
    /// Character with the "other" category code.
    Other(char),
    /// Reference to a command: either a control sequence or an active character.
    CommandRef(CommandRef),
}
61
/// The value of a token that references a command.
#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CommandRef {
    /// A control sequence, identified by its name key; the textual name is obtained by
    /// resolving the key with a [`CsNameInterner`].
    ControlSequence(CsName),
    /// An active character, identified by the character itself.
    ActiveCharacter(char),
}
69
70impl CommandRef {
71    pub fn to_string(&self, cs_name_interner: &CsNameInterner) -> String {
72        match self {
73            CommandRef::ControlSequence(cs_name) => {
74                format!("\\{}", cs_name_interner.resolve(*cs_name).unwrap())
75            }
76            CommandRef::ActiveCharacter(c) => format!("{c}"),
77        }
78    }
79}
80
81impl Value {
82    pub fn new(c: char, cat_code: CatCode) -> Value {
83        match cat_code {
84            CatCode::BeginGroup => Value::BeginGroup(c),
85            CatCode::EndGroup => Value::EndGroup(c),
86            CatCode::MathShift => Value::MathShift(c),
87            CatCode::AlignmentTab => Value::AlignmentTab(c),
88            CatCode::Parameter => Value::Parameter(c),
89            CatCode::Superscript => Value::Superscript(c),
90            CatCode::Subscript => Value::Subscript(c),
91            CatCode::Space => Value::Space(c),
92            CatCode::Letter => Value::Letter(c),
93            CatCode::Other => Value::Other(c),
94            CatCode::Active => Value::CommandRef(CommandRef::ActiveCharacter(c)),
95            _ => panic!("raw cat code not allowed"),
96        }
97    }
98
99    /// TODO: should have a char_and_catcode function
100    pub fn char(&self) -> Option<char> {
101        match *self {
102            Value::BeginGroup(c) => Some(c),
103            Value::EndGroup(c) => Some(c),
104            Value::MathShift(c) => Some(c),
105            Value::AlignmentTab(c) => Some(c),
106            Value::Parameter(c) => Some(c),
107            Value::Superscript(c) => Some(c),
108            Value::Subscript(c) => Some(c),
109            Value::Space(c) => Some(c),
110            Value::Letter(c) => Some(c),
111            Value::Other(c) => Some(c),
112            Value::CommandRef(command_ref) => match command_ref {
113                CommandRef::ControlSequence(_) => None,
114                CommandRef::ActiveCharacter(c) => Some(c),
115            },
116        }
117    }
118
119    pub fn cat_code(&self) -> Option<CatCode> {
120        match self {
121            Value::BeginGroup(_) => Some(CatCode::BeginGroup),
122            Value::EndGroup(_) => Some(CatCode::EndGroup),
123            Value::MathShift(_) => Some(CatCode::MathShift),
124            Value::AlignmentTab(_) => Some(CatCode::AlignmentTab),
125            Value::Parameter(_) => Some(CatCode::Parameter),
126            Value::Superscript(_) => Some(CatCode::Superscript),
127            Value::Subscript(_) => Some(CatCode::Subscript),
128            Value::Space(_) => Some(CatCode::Space),
129            Value::Letter(_) => Some(CatCode::Letter),
130            Value::Other(_) => Some(CatCode::Other),
131            Value::CommandRef(command_ref) => match command_ref {
132                CommandRef::ControlSequence(_) => None,
133                CommandRef::ActiveCharacter(_) => Some(CatCode::Active),
134            },
135        }
136    }
137}
138
/// A TeX token.
///
/// A token pairs a [`Value`] with a [`trace::Key`]. Both fields are private and are read
/// through [`Token::value`] and [`Token::trace_key`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Token {
    /// What the token is: a character with a category code, or a command reference.
    value: Value,
    /// Key supplied when the token was constructed; its meaning is defined by the
    /// [`trace`] module.
    trace_key: trace::Key,
}
146
147impl std::fmt::Display for Token {
148    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
149        match &self.value {
150            Value::CommandRef(_) => {
151                write![f, "todo"] // TODO
152            }
153            _ => {
154                write![f, "{}", self.char().unwrap()]
155            }
156        }
157    }
158}
159
// Generates a public constructor named `$fn_name` that takes a character and a
// trace key, wraps the character with `$variant`, and returns the resulting token.
macro_rules! token_constructor {
    ($fn_name: ident, $variant: expr) => {
        /// Creates a character token from `c`, tagged with `trace_key`.
        pub fn $fn_name(c: char, trace_key: trace::Key) -> Token {
            Token::new_from_value($variant(c), trace_key)
        }
    };
}
170
171impl Token {
172    token_constructor!(new_begin_group, Value::BeginGroup);
173    token_constructor!(new_end_group, Value::EndGroup);
174    token_constructor!(new_math_shift, Value::MathShift);
175    token_constructor!(new_alignment_tab, Value::AlignmentTab);
176    token_constructor!(new_parameter, Value::Parameter);
177    token_constructor!(new_superscript, Value::Superscript);
178    token_constructor!(new_subscript, Value::Subscript);
179    token_constructor!(new_space, Value::Space);
180    token_constructor!(new_letter, Value::Letter);
181    token_constructor!(new_other, Value::Other);
182
183    pub fn new_command_ref(command_ref: CommandRef, trace_key: trace::Key) -> Token {
184        Token {
185            value: Value::CommandRef(command_ref),
186            trace_key,
187        }
188    }
189
190    pub fn new_active_character(c: char, trace_key: trace::Key) -> Token {
191        Token {
192            value: Value::CommandRef(CommandRef::ActiveCharacter(c)),
193            trace_key,
194        }
195    }
196
197    pub fn new_control_sequence(name: CsName, trace_key: trace::Key) -> Token {
198        Token {
199            value: Value::CommandRef(CommandRef::ControlSequence(name)),
200            trace_key,
201        }
202    }
203
204    pub fn new_from_value(value: Value, trace_key: trace::Key) -> Token {
205        Token { value, trace_key }
206    }
207
208    #[inline]
209    pub fn value(&self) -> Value {
210        self.value
211    }
212
213    #[inline]
214    pub fn trace_key(&self) -> trace::Key {
215        self.trace_key
216    }
217
218    /// TODO: should have a char_and_catcode function
219    pub fn char(&self) -> Option<char> {
220        self.value.char()
221    }
222
223    pub fn cat_code(&self) -> Option<CatCode> {
224        self.value.cat_code()
225    }
226}
227
/// Whitespace that has been requested but not yet written out.
#[derive(Default)]
enum PendingWhitespace {
    /// Initial state. Every request for whitespace is ignored until `reset` is called.
    #[default]
    NotStarted,
    /// No whitespace is pending.
    None,
    /// A single space is pending.
    Space,
    /// The given number of newlines is pending.
    Newlines(usize),
}

impl PendingWhitespace {
    /// Clears any pending whitespace; this also leaves the initial state.
    fn reset(&mut self) {
        *self = Self::None;
    }

    /// Requests a space. Only has an effect when nothing is pending: an already
    /// pending space or pending newlines are kept as they are.
    fn add_space(&mut self) {
        if matches!(*self, Self::None) {
            *self = Self::Space;
        }
    }

    /// Requests one more newline. A pending space is replaced by the newline.
    fn add_newline(&mut self) {
        let next = match *self {
            Self::NotStarted => return,
            Self::None | Self::Space => Self::Newlines(1),
            Self::Newlines(count) => Self::Newlines(count + 1),
        };
        *self = next;
    }

    /// Requests a paragraph break: raises the pending whitespace to two newlines
    /// unless two or more newlines are already pending.
    fn new_paragraph(&mut self) {
        if matches!(*self, Self::None | Self::Space | Self::Newlines(1)) {
            *self = Self::Newlines(2);
        }
    }
}
270
271impl Display for PendingWhitespace {
272    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
273        match self {
274            PendingWhitespace::NotStarted | PendingWhitespace::None => Ok(()),
275            PendingWhitespace::Space => {
276                write!(f, " ")
277            }
278            PendingWhitespace::Newlines(n) => {
279                for _ in 0..*n {
280                    writeln!(f)?;
281                }
282                Ok(())
283            }
284        }
285    }
286}
287
/// Data structure for writing tokens.
///
/// The writer does not own an output sink; one is passed to each [`Writer::write`] call.
/// The only state kept between calls is the whitespace that has not been written yet.
#[derive(Default)]
pub struct Writer {
    /// Whitespace requested so far that will be emitted before the next non-space value.
    pending_whitespace: PendingWhitespace,
}
293
294impl Writer {
295    /// Write a token.
296    pub fn write(
297        &mut self,
298        io_writer: &mut dyn std::io::Write,
299        interner: &CsNameInterner,
300        value: Value,
301    ) -> Result<(), std::io::Error> {
302        match &value {
303            Value::CommandRef(CommandRef::ControlSequence(s)) => {
304                write!(
305                    io_writer,
306                    "{}\\{}",
307                    self.pending_whitespace,
308                    interner.resolve(*s).unwrap()
309                )?;
310                self.pending_whitespace.reset();
311            }
312            Value::Space(_) => self.pending_whitespace.add_space(),
313            _ => {
314                write!(
315                    io_writer,
316                    "{}{}",
317                    self.pending_whitespace,
318                    value.char().unwrap()
319                )?;
320                self.pending_whitespace.reset();
321            }
322        }
323        io_writer.flush()
324    }
325
326    pub fn add_newline(&mut self) {
327        self.pending_whitespace.add_newline();
328    }
329
330    pub fn start_paragraph(&mut self) {
331        self.pending_whitespace.new_paragraph();
332    }
333}
334
335/// Write a collection of tokens to a string.
336pub fn write_tokens<'a, T>(tokens: T, interner: &CsNameInterner) -> String
337where
338    T: IntoIterator<Item = &'a Token>,
339{
340    let mut buffer: Vec<u8> = Default::default();
341    let mut writer: Writer = Default::default();
342    for token in tokens.into_iter() {
343        writer.write(&mut buffer, interner, token.value()).unwrap();
344    }
345    std::str::from_utf8(&buffer).unwrap().into()
346}
347
348/// Write a collection of token values to a string.
349pub fn write_token_values<'a, T>(values: T, interner: &CsNameInterner) -> String
350where
351    T: IntoIterator<Item = &'a Value>,
352{
353    let mut buffer: Vec<u8> = Default::default();
354    let mut writer: Writer = Default::default();
355    for value in values.into_iter() {
356        writer.write(&mut buffer, interner, *value).unwrap();
357    }
358    std::str::from_utf8(&buffer).unwrap().into()
359}
360
#[cfg(test)]
mod tests {
    use super::*;

    /// One step of a writer test: either a value to write or a whitespace request.
    enum Instruction {
        ControlSequence(&'static str),
        Character(char, CatCode),
        Newline,
        NewParagraph,
    }

    /// Shorthand for a character with the letter category code.
    fn letter(c: char) -> Instruction {
        Instruction::Character(c, CatCode::Letter)
    }

    /// Shorthand for a character with the space category code.
    fn space(c: char) -> Instruction {
        Instruction::Character(c, CatCode::Space)
    }

    /// Feeds `input` to a fresh `Writer` and checks that the output equals `want`.
    fn writer_test(input: Vec<Instruction>, want: &str) {
        let mut buffer: Vec<u8> = Vec::new();
        let mut writer = Writer::default();
        let mut interner = CsNameInterner::default();
        for instruction in input {
            // Whitespace requests are applied directly; everything else yields a value to write.
            let value = match instruction {
                Instruction::Newline => {
                    writer.add_newline();
                    continue;
                }
                Instruction::NewParagraph => {
                    writer.start_paragraph();
                    continue;
                }
                Instruction::ControlSequence(name) => {
                    let cs_name = interner.get_or_intern(name);
                    Token::new_control_sequence(cs_name, trace::Key::dummy()).value()
                }
                Instruction::Character(c, code) => {
                    Token::new_from_value(Value::new(c, code), trace::Key::dummy()).value()
                }
            };
            writer.write(&mut buffer, &interner, value).unwrap();
        }
        let got = String::from_utf8(buffer).unwrap();
        let want = want.to_string();

        if got == want {
            return;
        }
        println!("Output is different:");
        println!("------[got]-------");
        println!("{}", got);
        println!("------[want]------");
        println!("{}", want);
        println!("-----------------");
        panic!("write_tokens test failed");
    }

    // Expands each `(name, input, want)` triple into a `#[test]` function.
    macro_rules! write_tokens_tests {
        ($( ($test_name: ident, $instructions: expr, $expected: expr), )+) => {
            $(
            #[test]
            fn $test_name() {
                writer_test($instructions, $expected);
            }
            )+
        };
    }

    write_tokens_tests!(
        (blank, vec![], ""),
        (
            trim_whitespace_from_start,
            vec![space('\n'), space('\n'), space('\n'), letter('H')],
            "H"
        ),
        (
            trim_whitespace_from_end,
            vec![letter('H'), space('\n'), space('\n'), space('\n')],
            "H"
        ),
        (
            trim_whitespace_from_middle_1,
            vec![letter('H'), space(' '), space(' '), letter('W')],
            "H W"
        ),
        (
            trim_whitespace_from_middle_2,
            vec![
                letter('H'),
                space('\n'),
                space(' '),
                space('\n'),
                letter('W'),
            ],
            "H W"
        ),
        (
            trim_whitespace_from_middle_3,
            vec![
                letter('H'),
                space('\n'),
                space('\n'),
                space('\n'),
                letter('W'),
            ],
            "H W"
        ),
        (
            control_sequence,
            vec![Instruction::ControlSequence("HelloWorld")],
            "\\HelloWorld"
        ),
        (
            newline_1,
            vec![letter('H'), Instruction::Newline, letter('W')],
            "H\nW"
        ),
        (
            newline_2,
            vec![
                letter('H'),
                Instruction::Newline,
                space(' '),
                Instruction::Newline,
                letter('W'),
            ],
            "H\n\nW"
        ),
        (
            newline_3,
            vec![
                letter('H'),
                Instruction::Newline,
                space(' '),
                Instruction::Newline,
                space(' '),
                Instruction::Newline,
                letter('W'),
            ],
            "H\n\n\nW"
        ),
        (
            par_1,
            vec![
                letter('H'),
                Instruction::NewParagraph,
                Instruction::NewParagraph,
                letter('W'),
            ],
            "H\n\nW"
        ),
        (
            par_2,
            vec![
                letter('H'),
                Instruction::NewParagraph,
                Instruction::NewParagraph,
                Instruction::NewParagraph,
                letter('W'),
            ],
            "H\n\nW"
        ),
    );

    /// Pins the in-memory size of the token types and of results wrapping them.
    #[test]
    fn token_size() {
        use std::mem::size_of;

        assert_eq!(size_of::<CommandRef>(), 8);
        assert_eq!(size_of::<Value>(), 8);
        assert_eq!(size_of::<Token>(), 12);
        assert_eq!(size_of::<Result<Token, ()>>(), 12);
        assert_eq!(size_of::<Result<Option<Token>, ()>>(), 12);
        assert_eq!(size_of::<crate::prelude::Result<Token>>(), 12);
        assert_eq!(size_of::<crate::prelude::Result<Option<Token>>>(), 12);
    }
}