texlang/token/
mod.rs

1//! TeX tokens and category codes.
2
3pub mod lexer;
4pub mod trace;
5use crate::types::CatCode;
6use std::{fmt::Display, num};
7use texcraft_stdext::collections::interner;
8
/// Compact identifier standing in for a control sequence name in Texlang.
///
/// The representation is intentionally opaque so that it can be tuned for performance
/// without affecting downstream consumers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CsName(num::NonZeroU32);

impl CsName {
    /// Returns the wrapped non-zero integer, widened to `usize`.
    #[inline]
    pub fn to_usize(&self) -> usize {
        let CsName(raw) = self;
        raw.get() as usize
    }

    /// Builds a name from an integer.
    ///
    /// Returns `None` when `u` is zero or does not fit in a `u32`.
    pub fn try_from_usize(u: usize) -> Option<CsName> {
        u32::try_from(u)
            .ok()
            .and_then(num::NonZeroU32::new)
            .map(CsName)
    }
}
31
32/// String interner for control sequence names.
33pub type CsNameInterner = interner::Interner<CsName>;
34
35impl interner::Key for CsName {
36    fn try_from_usize(index: usize) -> Option<Self> {
37        num::NonZeroU32::try_from_usize(index).map(CsName)
38    }
39
40    fn into_usize(self) -> usize {
41        self.0.into_usize()
42    }
43}
44
45/// The value of a token.
46#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48pub enum Value {
49    BeginGroup(char),
50    EndGroup(char),
51    MathShift(char),
52    AlignmentTab(char),
53    Parameter(char),
54    Superscript(char),
55    Subscript(char),
56    Space(char),
57    Letter(char),
58    Other(char),
59    CommandRef(CommandRef),
60}
61
62/// The value of a token that references a command
63#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
64#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
65pub enum CommandRef {
66    ControlSequence(CsName),
67    ActiveCharacter(char),
68}
69
70impl CommandRef {
71    pub fn to_string(&self, cs_name_interner: &CsNameInterner) -> String {
72        match self {
73            CommandRef::ControlSequence(cs_name) => {
74                format!("\\{}", cs_name_interner.resolve(*cs_name).unwrap())
75            }
76            CommandRef::ActiveCharacter(c) => format!("{c}"),
77        }
78    }
79}
80
81impl Value {
82    pub fn new(c: char, cat_code: CatCode) -> Value {
83        match cat_code {
84            CatCode::BeginGroup => Value::BeginGroup(c),
85            CatCode::EndGroup => Value::EndGroup(c),
86            CatCode::MathShift => Value::MathShift(c),
87            CatCode::AlignmentTab => Value::AlignmentTab(c),
88            CatCode::Parameter => Value::Parameter(c),
89            CatCode::Superscript => Value::Superscript(c),
90            CatCode::Subscript => Value::Subscript(c),
91            CatCode::Space => Value::Space(c),
92            CatCode::Letter => Value::Letter(c),
93            CatCode::Other => Value::Other(c),
94            CatCode::Active => Value::CommandRef(CommandRef::ActiveCharacter(c)),
95            _ => panic!("raw cat code not allowed"),
96        }
97    }
98
99    /// TODO: should have a char_and_catcode function
100    pub fn char(&self) -> Option<char> {
101        Some(self.char_and_cat_code()?.0)
102    }
103
104    pub fn cat_code(&self) -> Option<CatCode> {
105        Some(self.char_and_cat_code()?.1)
106    }
107
108    pub fn char_and_cat_code(&self) -> Option<(char, CatCode)> {
109        let (c, code) = match self {
110            Value::BeginGroup(c) => (c, CatCode::BeginGroup),
111            Value::EndGroup(c) => (c, CatCode::EndGroup),
112            Value::MathShift(c) => (c, CatCode::MathShift),
113            Value::AlignmentTab(c) => (c, CatCode::AlignmentTab),
114            Value::Parameter(c) => (c, CatCode::Parameter),
115            Value::Superscript(c) => (c, CatCode::Superscript),
116            Value::Subscript(c) => (c, CatCode::Subscript),
117            Value::Space(c) => (c, CatCode::Space),
118            Value::Letter(c) => (c, CatCode::Letter),
119            Value::Other(c) => (c, CatCode::Other),
120            Value::CommandRef(command_ref) => match command_ref {
121                CommandRef::ControlSequence(_) => return None,
122                CommandRef::ActiveCharacter(c) => (c, CatCode::Active),
123            },
124        };
125        Some((*c, code))
126    }
127}
128
129/// A TeX token.
130#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
131#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
132pub struct Token {
133    value: Value,
134    trace_key: trace::Key,
135}
136
137impl std::fmt::Display for Token {
138    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
139        match &self.value {
140            Value::CommandRef(_) => {
141                write![f, "todo"] // TODO
142            }
143            _ => {
144                write![f, "{}", self.char().unwrap()]
145            }
146        }
147    }
148}
149
// Generates a public `Token` constructor named `$ctor` that takes a character and a
// trace key, and wraps the character using `$wrap` (typically a `Value` variant).
macro_rules! token_constructor {
    ($ctor: ident, $wrap: expr) => {
        pub fn $ctor(c: char, trace_key: trace::Key) -> Token {
            let value = $wrap(c);
            Token { value, trace_key }
        }
    };
}
160
161impl Token {
162    token_constructor!(new_begin_group, Value::BeginGroup);
163    token_constructor!(new_end_group, Value::EndGroup);
164    token_constructor!(new_math_shift, Value::MathShift);
165    token_constructor!(new_alignment_tab, Value::AlignmentTab);
166    token_constructor!(new_parameter, Value::Parameter);
167    token_constructor!(new_superscript, Value::Superscript);
168    token_constructor!(new_subscript, Value::Subscript);
169    token_constructor!(new_space, Value::Space);
170    token_constructor!(new_letter, Value::Letter);
171    token_constructor!(new_other, Value::Other);
172
173    pub fn new_command_ref(command_ref: CommandRef, trace_key: trace::Key) -> Token {
174        Token {
175            value: Value::CommandRef(command_ref),
176            trace_key,
177        }
178    }
179
180    pub fn new_active_character(c: char, trace_key: trace::Key) -> Token {
181        Token {
182            value: Value::CommandRef(CommandRef::ActiveCharacter(c)),
183            trace_key,
184        }
185    }
186
187    pub fn new_control_sequence(name: CsName, trace_key: trace::Key) -> Token {
188        Token {
189            value: Value::CommandRef(CommandRef::ControlSequence(name)),
190            trace_key,
191        }
192    }
193
194    pub fn new_from_value(value: Value, trace_key: trace::Key) -> Token {
195        Token { value, trace_key }
196    }
197
198    #[inline]
199    pub fn value(&self) -> Value {
200        self.value
201    }
202
203    #[inline]
204    pub fn trace_key(&self) -> trace::Key {
205        self.trace_key
206    }
207
208    pub fn char(&self) -> Option<char> {
209        self.value.char()
210    }
211
212    pub fn cat_code(&self) -> Option<CatCode> {
213        self.value.cat_code()
214    }
215    pub fn char_and_cat_code(&self) -> Option<(char, CatCode)> {
216        self.value.char_and_cat_code()
217    }
218}
219
/// Whitespace that has been requested but not yet written out.
#[derive(Default)]
enum PendingWhitespace {
    /// Initial state: every whitespace request is ignored until `reset` is called.
    #[default]
    NotStarted,
    /// Nothing is pending.
    None,
    /// A single space is pending.
    Space,
    /// The given number of newlines is pending.
    Newlines(usize),
}

impl PendingWhitespace {
    /// Clears any pending whitespace (and leaves the `NotStarted` state).
    fn reset(&mut self) {
        *self = Self::None;
    }

    /// Requests a space. This only has an effect when nothing is pending.
    fn add_space(&mut self) {
        if let Self::None = self {
            *self = Self::Space;
        }
    }

    /// Requests one more newline. A pending space is replaced by the newline.
    fn add_newline(&mut self) {
        match self {
            Self::NotStarted => {}
            Self::None | Self::Space => *self = Self::Newlines(1),
            Self::Newlines(count) => *count += 1,
        }
    }

    /// Requests a paragraph break: fewer than two pending newlines are raised to
    /// exactly two; any other newline count is left as it is.
    fn new_paragraph(&mut self) {
        if matches!(self, Self::None | Self::Space | Self::Newlines(1)) {
            *self = Self::Newlines(2);
        }
    }
}

impl Display for PendingWhitespace {
    /// Writes the pending whitespace: nothing, one space, or the pending newlines.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotStarted | Self::None => Ok(()),
            Self::Space => f.write_str(" "),
            Self::Newlines(count) => (0..*count).try_for_each(|_| f.write_str("\n")),
        }
    }
}
279
280/// Data structure for writing tokens
281#[derive(Default)]
282pub struct Writer {
283    pending_whitespace: PendingWhitespace,
284}
285
286impl Writer {
287    /// Write a token.
288    pub fn write(
289        &mut self,
290        io_writer: &mut dyn std::io::Write,
291        interner: &CsNameInterner,
292        value: Value,
293    ) -> Result<(), std::io::Error> {
294        match &value {
295            Value::CommandRef(CommandRef::ControlSequence(s)) => {
296                write!(
297                    io_writer,
298                    "{}\\{}",
299                    self.pending_whitespace,
300                    interner.resolve(*s).unwrap()
301                )?;
302                self.pending_whitespace.reset();
303            }
304            Value::Space(_) => self.pending_whitespace.add_space(),
305            _ => {
306                write!(
307                    io_writer,
308                    "{}{}",
309                    self.pending_whitespace,
310                    value.char().unwrap()
311                )?;
312                self.pending_whitespace.reset();
313            }
314        }
315        io_writer.flush()
316    }
317
318    pub fn add_newline(&mut self) {
319        self.pending_whitespace.add_newline();
320    }
321
322    pub fn start_paragraph(&mut self) {
323        self.pending_whitespace.new_paragraph();
324    }
325}
326
327/// Write a collection of tokens to a string.
328pub fn write_tokens<'a, T>(tokens: T, interner: &CsNameInterner) -> String
329where
330    T: IntoIterator<Item = &'a Token>,
331{
332    let mut buffer: Vec<u8> = Default::default();
333    let mut writer: Writer = Default::default();
334    for token in tokens.into_iter() {
335        writer.write(&mut buffer, interner, token.value()).unwrap();
336    }
337    std::str::from_utf8(&buffer).unwrap().into()
338}
339
340/// Write a collection of token values to a string.
341pub fn write_token_values<'a, T>(values: T, interner: &CsNameInterner) -> String
342where
343    T: IntoIterator<Item = &'a Value>,
344{
345    let mut buffer: Vec<u8> = Default::default();
346    let mut writer: Writer = Default::default();
347    for value in values.into_iter() {
348        writer.write(&mut buffer, interner, *value).unwrap();
349    }
350    std::str::from_utf8(&buffer).unwrap().into()
351}
352
#[cfg(test)]
mod tests {
    use super::*;

    /// One step of a writer scenario.
    enum Instruction {
        ControlSequence(&'static str),
        Character(char, CatCode),
        Newline,
        NewParagraph,
    }

    /// Shorthand for a character instruction with the letter category code.
    fn letter(c: char) -> Instruction {
        Instruction::Character(c, CatCode::Letter)
    }

    /// Shorthand for a character instruction with the space category code.
    fn space(c: char) -> Instruction {
        Instruction::Character(c, CatCode::Space)
    }

    /// Feeds `input` through a fresh `Writer` and checks the output equals `want`.
    fn writer_test(input: Vec<Instruction>, want: &str) {
        let mut interner = CsNameInterner::default();
        let mut writer = Writer::default();
        let mut buffer = Vec::<u8>::new();
        for instruction in input {
            // Whitespace instructions act on the writer directly; the other two
            // produce a token that is written below.
            let token = match instruction {
                Instruction::ControlSequence(name) => {
                    let cs_name = interner.get_or_intern(name);
                    Token::new_control_sequence(cs_name, trace::Key::dummy())
                }
                Instruction::Character(c, code) => {
                    Token::new_from_value(Value::new(c, code), trace::Key::dummy())
                }
                Instruction::Newline => {
                    writer.add_newline();
                    continue;
                }
                Instruction::NewParagraph => {
                    writer.start_paragraph();
                    continue;
                }
            };
            writer.write(&mut buffer, &interner, token.value()).unwrap();
        }
        let got = std::str::from_utf8(&buffer).unwrap().to_owned();
        if got.as_str() == want {
            return;
        }

        println!("Output is different:");
        println!("------[got]-------");
        println!("{}", got);
        println!("------[want]------");
        println!("{}", want);
        println!("-----------------");
        panic!("write_tokens test failed");
    }

    // Expands each `(name, input, want)` triple into a `#[test]` function.
    macro_rules! write_tokens_tests {
        ($( ($test_name: ident, $instructions: expr, $expected: expr), )+) => {
            $(
            #[test]
            fn $test_name() {
                writer_test($instructions, $expected);
            }
            )+
        };
    }

    write_tokens_tests!(
        (blank, Vec::new(), ""),
        (
            trim_whitespace_from_start,
            vec![space('\n'), space('\n'), space('\n'), letter('H')],
            "H"
        ),
        (
            trim_whitespace_from_end,
            vec![letter('H'), space('\n'), space('\n'), space('\n')],
            "H"
        ),
        (
            trim_whitespace_from_middle_1,
            vec![letter('H'), space(' '), space(' '), letter('W')],
            "H W"
        ),
        (
            trim_whitespace_from_middle_2,
            vec![
                letter('H'),
                space('\n'),
                space(' '),
                space('\n'),
                letter('W'),
            ],
            "H W"
        ),
        (
            trim_whitespace_from_middle_3,
            vec![
                letter('H'),
                space('\n'),
                space('\n'),
                space('\n'),
                letter('W'),
            ],
            "H W"
        ),
        (
            control_sequence,
            vec![Instruction::ControlSequence("HelloWorld")],
            "\\HelloWorld"
        ),
        (
            newline_1,
            vec![letter('H'), Instruction::Newline, letter('W')],
            "H\nW"
        ),
        (
            newline_2,
            vec![
                letter('H'),
                Instruction::Newline,
                space(' '),
                Instruction::Newline,
                letter('W'),
            ],
            "H\n\nW"
        ),
        (
            newline_3,
            vec![
                letter('H'),
                Instruction::Newline,
                space(' '),
                Instruction::Newline,
                space(' '),
                Instruction::Newline,
                letter('W'),
            ],
            "H\n\n\nW"
        ),
        (
            par_1,
            vec![
                letter('H'),
                Instruction::NewParagraph,
                Instruction::NewParagraph,
                letter('W'),
            ],
            "H\n\nW"
        ),
        (
            par_2,
            vec![
                letter('H'),
                Instruction::NewParagraph,
                Instruction::NewParagraph,
                Instruction::NewParagraph,
                letter('W'),
            ],
            "H\n\nW"
        ),
    );

    /// Pins the in-memory size of tokens and of the result types that wrap them.
    #[test]
    fn token_size() {
        use std::mem::size_of;

        assert_eq!(size_of::<CommandRef>(), 8);
        assert_eq!(size_of::<Value>(), 8);
        assert_eq!(size_of::<Token>(), 12);
        assert_eq!(size_of::<Result<Token, ()>>(), 12);
        assert_eq!(size_of::<Result<Option<Token>, ()>>(), 12);
        assert_eq!(size_of::<crate::prelude::Result<Token>>(), 12);
        assert_eq!(size_of::<crate::prelude::Result<Option<Token>>>(), 12);
    }
}