texlang_texttransform/
lib.rs

1//! Implementation of TeX primitives relating to text transforming (hyphenation, upper and lower casing).
2//!
//! The theme of the primitives here (`\lccode`, `\lowercase`, `\patterns`) is that
//! they're all related to "text transformation" (case change, hyphenation).
//! However the real reason they're grouped is that the hyphenator hyphenates based on
//! each character's lower case, and so has a dependency on `\lccode` which defines
//! these cases.
8//!
//! This crate is also a good experiment in how defining registers works outside of the
//! standard lib. At time of writing (May 2026) I can say that it's not really great.
//! The problem is that having the exact component specified is a bit too rigid.
//! For example, I might like multiple registers in the same component.
13
14use texlang::prelude as txl;
15use texlang::token::{Token, Value};
16use texlang::traits::*;
17use texlang::*;
18use texlang_stdlib::registers;
19
/// Component that holds the 256 `\lccode` values.
///
/// Each entry is the lowercase character code for the corresponding character (0–255).
/// The values are stored as `u8`, which covers exactly the 0–255 range of character
/// codes handled here. An entry of 0 means the character has no lowercase mapping:
/// `\lowercase` leaves such characters unchanged.
pub type LcCodeComponent = registers::Component<u8, 256, LcCodeMarker>;
26
/// Marker type used as the last type parameter of [`LcCodeComponent`].
///
/// It carries no data; it only makes [`LcCodeComponent`] a different type from
/// [`UcCodeComponent`], which otherwise has the same element type and size.
pub struct LcCodeMarker;
28
29/// Get the `\lccode` command.
30pub fn get_lccode<S: HasComponent<LcCodeComponent>>() -> command::BuiltIn<S> {
31    registers::new_registers_command()
32}
33
/// Component that holds the 256 `\uccode` values.
///
/// Each entry is the uppercase character code for the corresponding character (0–255).
/// The values are stored as `u8`. An entry of 0 means the character has no uppercase
/// mapping: `\uppercase` leaves such characters unchanged.
pub type UcCodeComponent = registers::Component<u8, 256, UcCodeMarker>;
38
/// Marker type used as the last type parameter of [`UcCodeComponent`].
///
/// It carries no data; it only makes [`UcCodeComponent`] a different type from
/// [`LcCodeComponent`], which otherwise has the same element type and size.
pub struct UcCodeMarker;
40
41/// Get the `\uccode` command.
42pub fn get_uccode<S: HasComponent<UcCodeComponent>>() -> command::BuiltIn<S> {
43    registers::new_registers_command()
44}
45
46/// Get the `\lowercase` expansion primitive.
47pub fn get_lowercase<S: HasComponent<LcCodeComponent>>() -> command::BuiltIn<S> {
48    command::BuiltIn::new_expansion(lowercase_primitive_fn)
49}
50
51fn lowercase_primitive_fn<S: HasComponent<LcCodeComponent>>(
52    _: token::Token,
53    input: &mut vm::ExpansionInput<S>,
54) -> txl::Result<()> {
55    // TeX.2021.1288
56    let mut list = texlang::parse::UnexpandedTokenList::parse(input)?.0;
57    transform_list(&mut list, input.state().component().values());
58    input.push_expansion(&list);
59    input.return_token_buffer(list);
60    Ok(())
61}
62
63/// Get the `\uppercase` expansion primitive.
64pub fn get_uppercase<S: HasComponent<UcCodeComponent>>() -> command::BuiltIn<S> {
65    command::BuiltIn::new_expansion(uppercase_primitive_fn)
66}
67
68fn uppercase_primitive_fn<S: HasComponent<UcCodeComponent>>(
69    _: token::Token,
70    input: &mut vm::ExpansionInput<S>,
71) -> txl::Result<()> {
72    // TeX.2021.1288
73    let mut list = texlang::parse::UnexpandedTokenList::parse(input)?.0;
74    transform_list(&mut list, input.state().component().values());
75    input.push_expansion(&list);
76    input.return_token_buffer(list);
77    Ok(())
78}
79
80fn transform_list(list: &mut [token::Token], values: &[u8]) {
81    for token in list {
82        let Some((c, cat_code)) = token.char_and_cat_code() else {
83            continue;
84        };
85        let Some(new_c_raw) = values.get(c as usize) else {
86            continue;
87        };
88        if *new_c_raw == 0 {
89            continue;
90        }
91        let new_c = char::from(*new_c_raw);
92        *token = Token::new_from_value(Value::new(new_c, cat_code), token.trace_key())
93    }
94}
95
/// Component that owns the hyphenator used by the `\patterns` and `\hyphenation`
/// primitives.
///
/// `\patterns` loads patterns into the hyphenator and `\hyphenation` inserts
/// exceptions into it.
#[derive(Default)]
pub struct HyphenationComponent {
    /// The hyphenator that receives patterns and exceptions.
    hyphenator: hyphenate::Hyphenator,
}
100
101/// Get the `\patterns` primitive.
102pub fn get_patterns<S: HasComponent<HyphenationComponent>>() -> command::BuiltIn<S> {
103    command::BuiltIn::new_execution(patterns_primitive_fn)
104}
105
106fn patterns_primitive_fn<S: HasComponent<HyphenationComponent>>(
107    _: token::Token,
108    input: &mut vm::ExecutionInput<S>,
109) -> txl::Result<()> {
110    // TeX.2021.961
111    texlang::parse::LeftBrace::parse(input)?;
112    let mut s = String::new();
113    loop {
114        let token = input.next_or_err(EndOfInputError {})?;
115        use Value::*;
116        match token.value() {
117            EndGroup(_) => {
118                break;
119            }
120            Space(_) => {
121                s.push(' ');
122            }
123            Letter(c) | Other(c) => {
124                s.push(c);
125            }
126            _ => {
127                // TODO: error
128            }
129        }
130    }
131    input
132        .state_mut()
133        .component_mut()
134        .hyphenator
135        .load_patterns(&s);
136    Ok(())
137}
138
139/// Get the `\hyphenation` primitive.
140pub fn get_hyphenation<S: HasComponent<HyphenationComponent>>() -> command::BuiltIn<S> {
141    command::BuiltIn::new_execution(hyphenation_primitive_fn)
142}
143
144fn hyphenation_primitive_fn<S: HasComponent<HyphenationComponent>>(
145    _: token::Token,
146    input: &mut vm::ExecutionInput<S>,
147) -> txl::Result<()> {
148    // TeX.2021.935
149    texlang::parse::LeftBrace::parse(input)?;
150    let mut s = String::new();
151    loop {
152        let token = input.next_or_err(EndOfInputError {})?;
153        use Value::*;
154        match token.value() {
155            EndGroup(_) => {
156                break;
157            }
158            Space(_) => {
159                s.push(' ');
160            }
161            Letter(c) | Other(c) => {
162                // TODO: need to check the \lccode is valid
163                // as in TeX.2021.937.
164                s.push(c);
165            }
166            _ => {
167                // TODO: error and handle other cases. TeX is pretty flexible in
168                // handling exceptions here.
169            }
170        }
171    }
172    input
173        .state_mut()
174        .component_mut()
175        .hyphenator
176        .insert_exception(&s);
177    Ok(())
178}
179
180#[derive(Debug)]
181struct EndOfInputError;
182
183impl error::EndOfInputError for EndOfInputError {
184    fn doing(&self) -> String {
185        r"determining the argument to \pattern or \hyphenation".into()
186    }
187}
188
// Unit tests for the `\lccode`/`\uccode` registers and the `\lowercase`/`\uppercase`
// primitives. The hyphenation primitives are not registered in this test state.
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use texlang::vm::implement_has_component;
    use texlang_stdlib::{prefix, the};
    use texlang_testing::*;

    /// Test state holding the components needed by the commands under test.
    #[derive(Default)]
    struct State {
        lccode: LcCodeComponent,
        uccode: UcCodeComponent,
        prefix: prefix::Component,
        testing: TestingComponent,
    }

    impl TexlangState for State {
        // Delegates recoverable errors to the testing component.
        fn recoverable_error_hook(
            &self,
            recoverable_error: error::TracedTexError,
        ) -> Result<(), Box<dyn error::TexError>> {
            TestingComponent::recoverable_error_hook(self, recoverable_error)
        }
        // Delegates the assignment scope decision to the prefix component
        // (`\global` is registered below).
        fn variable_assignment_scope_hook(
            state: &mut Self,
        ) -> texcraft_stdext::collections::groupingmap::Scope {
            prefix::variable_assignment_scope_hook(state)
        }
    }
    impl the::TheCompatible for State {}

    implement_has_component![State {
        lccode: LcCodeComponent,
        uccode: UcCodeComponent,
        prefix: prefix::Component,
        testing: TestingComponent,
    }];

    /// The commands available to the TeX snippets in the test suite below.
    fn built_in_commands() -> HashMap<&'static str, command::BuiltIn<State>> {
        HashMap::from([
            ("the", the::get_the()),
            ("def", texlang_stdlib::def::get_def()),
            ("lccode", get_lccode()),
            ("uccode", get_uccode()),
            ("lowercase", get_lowercase()),
            ("uppercase", get_uppercase()),
            ("global", prefix::get_global()),
        ])
    }

    test_suite![expansion_equality_tests(
        // \lccode and \uccode register tests
        (
            lccode_write_and_read,
            r"\lccode 65 97 \the\lccode 65",
            r"97"
        ),
        (
            uccode_write_and_read,
            r"\uccode 97 65 \the\uccode 97",
            r"65"
        ),
        // unset entries read back as 0
        (lccode_default, r"\the\lccode 65", r"0"),
        (uccode_default, r"\the\uccode 97", r"0"),
        // assignments made inside a group are undone when the group ends
        (
            lccode_grouping,
            r"{\lccode 65 97 \the\lccode 65}-\the\lccode 65",
            r"97-0"
        ),
        (
            uccode_grouping,
            r"{\uccode 97 65 \the\uccode 97}-\the\uccode 97",
            r"65-0"
        ),
        // \uppercase tests
        // uccode 0 means leave the character unchanged
        (uppercase_zero_uccode, r"\uppercase{a}", r"a"),
        // basic single character substitution: 'a'(97) -> 'A'(65)
        (uppercase_single_char, r"\uccode 97 65 \uppercase{a}", r"A"),
        // multiple characters
        (
            uppercase_multiple_chars,
            r"\uccode 97 65 \uccode 98 66 \uppercase{ab}",
            r"AB"
        ),
        // characters with uccode=0 are left unchanged
        (uppercase_mixed, r"\uccode 97 65 \uppercase{a1}", r"A1"),
        // the argument is read without expansion, so the macro `\a` is passed through
        // untouched and the `a` it later expands to is not uppercased
        (
            uppercase_uses_unexpanded_stream,
            r"\def\a{a}\uccode 97 65 \uppercase{\a}",
            r"a"
        ),
        // \lowercase tests
        // lccode 0 means leave the character unchanged
        (lowercase_zero_lccode, r"\lowercase{A}", r"A"),
        // basic single character substitution: 'A'(65) -> 'a'(97)
        (lowercase_single_char, r"\lccode 65 97 \lowercase{A}", r"a"),
        // multiple characters
        (
            lowercase_multiple_chars,
            r"\lccode 65 97 \lccode 66 98 \lowercase{AB}",
            r"ab"
        ),
        // characters with lccode=0 are left unchanged
        (lowercase_mixed, r"\lccode 65 97 \lowercase{A1}", r"a1"),
        // the argument is read without expansion, so the macro `\A` is passed through
        // untouched and the `A` it later expands to is not lowercased
        (
            lowercase_uses_unexpanded_stream,
            r"\def\A{A}\lccode 65 97 \lowercase{\A}",
            r"A"
        ),
        // \uppercase and \lowercase don't interfere with each other
        (
            uppercase_unaffected_by_lccode,
            r"\lccode 65 97 \uccode 97 65 \uppercase{aA}",
            r"AA"
        ),
        (
            lowercase_unaffected_by_uccode,
            r"\uccode 97 65 \lccode 65 97 \lowercase{AA}",
            r"aa"
        ),
    ),];
}