// texlang/types/catcode.rs

use CatCode::*;

/// Enum representing all 16 category codes in TeX.
///
/// Each variant's discriminant is the numeric value of the category code,
/// so `cat_code as u8` yields a number between 0 and 15.
///
/// Each variant's documentation contains an example character which is mapped to that category code in plainTeX.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CatCode {
    /// Marks the beginning of a control sequence.
    /// Example: `\`.
    ///
    /// This category code is never seen outside of the lexer.
    Escape = 0,
    /// Begins a new group.
    /// Example: `{`.
    BeginGroup = 1,
    /// Ends the current group.
    /// Example: `}`.
    EndGroup = 2,
    /// Starts or ends math mode.
    /// Example: `$`.
    MathShift = 3,
    /// Used in typesetting tables to align cells.
    /// Example: `&`.
    AlignmentTab = 4,
    /// Marks a new line in the input.
    /// Example: `\n`.
    ///
    /// This code behaves similarly to [`Space`](CatCode::Space), but has two additional properties.
    /// First, two or more consecutive new lines, modulo intervening [`Space`](CatCode::Space) characters,
    ///     create a `\par` control sequence instead of a regular space.
    /// Second, this code terminates a comment that started with a [`Comment`](CatCode::Comment) character.
    ///
    /// This category code is never seen outside of the lexer.
    EndOfLine = 5,
    /// Marks the beginning of a parameter number.
    /// It must generally be followed by a digit.
    /// Example: `#`.
    Parameter = 6,
    /// Puts the following character or group in a superscript.
    /// Example: `^`.
    Superscript = 7,
    /// Puts the following character or group in a subscript.
    /// Example: `_`.
    Subscript = 8,
    /// Character that is ignored by the lexer.
    /// Example: ASCII null (0).
    ///
    /// This category code is never seen outside of the lexer.
    Ignored = 9,
    /// Whitespace.
    /// Example: ` `.
    Space = 10,
    /// A character that can be used as a control sequence name.
    /// Examples: `[a-zA-Z]`.
    Letter = 11,
    /// A character that cannot be used as a control sequence name.
    /// Example: `@`.
    ///
    /// This is the default category code.
    #[default]
    Other = 12,
    /// A single character that behaves like a control sequence.
    /// Example: `~`.
    Active = 13,
    /// Marks the beginning of a comment.
    /// All characters until the next [`EndOfLine`](CatCode::EndOfLine) are ignored.
    /// Example: `%`.
    ///
    /// This category code is never seen outside of the lexer.
    Comment = 14,
    /// An invalid character.
    /// If this is encountered in the input, the lexer will return an error.
    /// Example: ASCII delete (127).
    ///
    /// This category code is never seen outside of the lexer.
    Invalid = 15,
}

77impl TryFrom<u8> for CatCode {
78    type Error = ();
79
80    fn try_from(value: u8) -> Result<Self, Self::Error> {
81        match value {
82            0 => Ok(Escape),
83            1 => Ok(BeginGroup),
84            2 => Ok(EndGroup),
85            3 => Ok(MathShift),
86            4 => Ok(AlignmentTab),
87            5 => Ok(EndOfLine),
88            6 => Ok(Parameter),
89            7 => Ok(Superscript),
90            8 => Ok(Subscript),
91            9 => Ok(Ignored),
92            10 => Ok(Space),
93            11 => Ok(Letter),
94            12 => Ok(Other),
95            13 => Ok(Active),
96            14 => Ok(Comment),
97            15 => Ok(Invalid),
98            _ => Err(()),
99        }
100    }
101}
102
103impl std::fmt::Display for CatCode {
104    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
105        write!(
106            f,
107            "{} ({})",
108            *self as u8,
109            match self {
110                Escape => "escape",
111                BeginGroup => "begin group",
112                EndGroup => "end group",
113                MathShift => "math shift",
114                AlignmentTab => "alignment tab",
115                EndOfLine => "end of line",
116                Parameter => "parameter",
117                Superscript => "superscript",
118                Subscript => "subscript",
119                Ignored => "ignored",
120                Space => "space",
121                Letter => "letter",
122                Other => "other",
123                Active => "active",
124                Comment => "comment",
125                Invalid => "invalid",
126            }
127        )?;
128        Ok(())
129    }
130}
131
132impl CatCode {
133    /// Default category codes in INITEX for all ASCII characters.
134    ///
135    /// To find the category code for an ASCII character,
136    ///     convert it to an integer and use it as an index for the array.
137    ///
138    /// This list was compiled by reading the source code TeX '82,
139    ///     specifically section 232 in the "TeX: the program".
140    /// These defaults are also described in the TeXBook p343.
141    pub const INITEX_DEFAULTS: [CatCode; 128] = [
142        Ignored, // ASCII null
143        Other, Other, Other, Other, Other, Other, Other, Other, Other,
144        EndOfLine, // carriage return
145        Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other,
146        Other, Other, Other, Other, Other, Other, Other, Other, Space, // space
147        Other, Other, Other, Other, Comment, // %
148        Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other,
149        Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other, Other,
150        Other, Letter, // A
151        Letter, // B
152        Letter, // C
153        Letter, // D
154        Letter, // E
155        Letter, // F
156        Letter, // G
157        Letter, // H
158        Letter, // I
159        Letter, // J
160        Letter, // K
161        Letter, // L
162        Letter, // M
163        Letter, // N
164        Letter, // O
165        Letter, // P
166        Letter, // Q
167        Letter, // R
168        Letter, // S
169        Letter, // T
170        Letter, // U
171        Letter, // V
172        Letter, // W
173        Letter, // X
174        Letter, // Y
175        Letter, // Z
176        Other, Escape, // \
177        Other, Other, Other, Other, Letter, // a
178        Letter, // b
179        Letter, // c
180        Letter, // d
181        Letter, // e
182        Letter, // f
183        Letter, // g
184        Letter, // h
185        Letter, // i
186        Letter, // j
187        Letter, // k
188        Letter, // l
189        Letter, // m
190        Letter, // n
191        Letter, // o
192        Letter, // p
193        Letter, // q
194        Letter, // r
195        Letter, // s
196        Letter, // t
197        Letter, // u
198        Letter, // v
199        Letter, // w
200        Letter, // x
201        Letter, // y
202        Letter, // z
203        Other, Other, Other, Other, Invalid, // ASCII delete
204    ];
205
206    /// Default category codes in plainTeX for all ASCII characters.
207    ///
208    /// To find the category code for an ASCII character,
209    ///     convert it to an integer and use it as an index for the array.
210    ///
211    /// This list was compiled by starting with [INITEX_DEFAULTS](CatCode::INITEX_DEFAULTS) and then applying
212    ///     all category code changes in the plainTeX format.
213    /// These changes are described on p343 of the TeXBook.
214    pub const PLAIN_TEX_DEFAULTS: [CatCode; 128] = [
215        Ignored, // ASCII null
216        Other,
217        Other,
218        Other,
219        Other,
220        Other,
221        Other,
222        Other,
223        Other,
224        Space,     // horizontal tab (\t)
225        EndOfLine, // line feed (\n)
226        Other,
227        Active,    // ASCII form-feed
228        EndOfLine, // carriage return (\r)
229        Other,
230        Other,
231        Other,
232        Other,
233        Other,
234        Other,
235        Other,
236        Other,
237        Other,
238        Other,
239        Other,
240        Other,
241        Other,
242        Other,
243        Other,
244        Other,
245        Other,
246        Other,
247        Space, // space
248        Other,
249        Other,
250        Parameter,    // #
251        MathShift,    // $
252        Comment,      // %
253        AlignmentTab, // &
254        Other,
255        Other,
256        Other,
257        Other,
258        Other,
259        Other,
260        Other,
261        Other,
262        Other,
263        Other,
264        Other,
265        Other,
266        Other,
267        Other,
268        Other,
269        Other,
270        Other,
271        Other,
272        Other,
273        Other,
274        Other,
275        Other,
276        Other,
277        Other,
278        Other,
279        Other,
280        Letter, // A
281        Letter, // B
282        Letter, // C
283        Letter, // D
284        Letter, // E
285        Letter, // F
286        Letter, // G
287        Letter, // H
288        Letter, // I
289        Letter, // J
290        Letter, // K
291        Letter, // L
292        Letter, // M
293        Letter, // N
294        Letter, // O
295        Letter, // P
296        Letter, // Q
297        Letter, // R
298        Letter, // S
299        Letter, // T
300        Letter, // U
301        Letter, // V
302        Letter, // W
303        Letter, // X
304        Letter, // Y
305        Letter, // Z
306        Other,
307        Escape, // \
308        Other,
309        Superscript, // ^
310        Subscript,   // _
311        Other,
312        Letter,     // a
313        Letter,     // b
314        Letter,     // c
315        Letter,     // d
316        Letter,     // e
317        Letter,     // f
318        Letter,     // g
319        Letter,     // h
320        Letter,     // i
321        Letter,     // j
322        Letter,     // k
323        Letter,     // l
324        Letter,     // m
325        Letter,     // n
326        Letter,     // o
327        Letter,     // p
328        Letter,     // q
329        Letter,     // r
330        Letter,     // s
331        Letter,     // t
332        Letter,     // u
333        Letter,     // v
334        Letter,     // w
335        Letter,     // x
336        Letter,     // y
337        Letter,     // z
338        BeginGroup, // {
339        Other,
340        EndGroup, // }
341        Active,   // ~
342        Invalid,  // ASCII delete
343    ];
344}
345
#[cfg(test)]
mod tests {
    use super::*;

    /// Every category code, listed in increasing numeric order.
    const ALL_CAT_CODES: [CatCode; 16] = [
        CatCode::Escape,
        CatCode::BeginGroup,
        CatCode::EndGroup,
        CatCode::MathShift,
        CatCode::AlignmentTab,
        CatCode::EndOfLine,
        CatCode::Parameter,
        CatCode::Superscript,
        CatCode::Subscript,
        CatCode::Ignored,
        CatCode::Space,
        CatCode::Letter,
        CatCode::Other,
        CatCode::Active,
        CatCode::Comment,
        CatCode::Invalid,
    ];

    /// Casting a category code to `u8` and converting it back must return the same code,
    /// and the number must equal the code's position in the list above.
    #[test]
    fn serialize_and_deserialize_cat_code() {
        for (index, cat_code) in ALL_CAT_CODES.iter().copied().enumerate() {
            let raw = cat_code as u8;
            assert_eq!(usize::from(raw), index);
            assert_eq!(CatCode::try_from(raw), Ok(cat_code));
        }
    }

    /// Numbers above 15 do not correspond to any category code.
    #[test]
    fn out_of_range_values_are_rejected() {
        for raw in 16..=u8::MAX {
            assert_eq!(CatCode::try_from(raw), Err(()));
        }
    }
}