boxworks_text/
lib.rs

//! # Boxworks text preprocessor
//!
//! This crate implements the logic that converts text (words and spaces)
//! into horizontal list elements.
//! The corresponding logic in Knuth's TeX is implemented in the
//! Chief Executive chapter (starting in TeX.2021.1029).
7
8use boxworks::ds;
9use std::collections::HashMap;
10use tfm::ligkern;
11
12#[derive(Debug)]
13struct Font {
14    default_space: common::Glue,
15    extra_space: common::Scaled,
16    lig_kern_program: tfm::ligkern::CompiledProgram,
17}
18
19#[derive(Default)]
20pub struct TextPreprocessorImpl {
21    fonts: Vec<Font>,
22    // TODO: should be initialized to the null font
23    // TODO: should current_font be some kind of specific font identifier type.
24    current_font: u32,
25    space_factor: SpaceFactor,
26    pub space_factor_codes: SpaceFactorCodes,
27}
28
/// Space factor codes, one per character code in `0..256`.
///
/// The code of a character controls how the running space factor changes
/// after that character has been typeset (see `SpaceFactor::adjust`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SpaceFactorCodes(pub [i32; 256]);

impl Default for SpaceFactorCodes {
    /// Returns the plain TeX defaults; see [`SpaceFactorCodes::plain_tex_defaults`].
    fn default() -> Self {
        Self::plain_tex_defaults()
    }
}

impl SpaceFactorCodes {
    /// Builds the table of space factor codes that is in effect in plain TeX.
    ///
    /// Every character starts at 1000. Uppercase ASCII letters get 999,
    /// the closing characters `)`, `'` and `]` get 0, and sentence
    /// punctuation gets the `\nonfrenchspacing` values.
    pub fn plain_tex_defaults() -> Self {
        // (character, code) pairs that differ from the 1000 baseline.
        const OVERRIDES: [(u8, i32); 9] = [
            // From plain.tex
            (b')', 0),
            (b'\'', 0),
            (b']', 0),
            // From \nonfrenchspacing in plain.tex
            (b'.', 3000),
            (b'?', 3000),
            (b'!', 3000),
            (b':', 2000),
            (b';', 1500),
            (b',', 1250),
        ];

        let mut codes = [1000_i32; 256];
        // INITEX
        for upper in b'A'..=b'Z' {
            codes[usize::from(upper)] = 999;
        }
        for &(byte, code) in OVERRIDES.iter() {
            codes[usize::from(byte)] = code;
        }
        Self(codes)
    }
}
62
63#[derive(Debug, PartialEq, Eq, Clone, Copy)]
64pub struct SpaceFactor(pub i32);
65
66impl Default for SpaceFactor {
67    fn default() -> Self {
68        Self(1000)
69    }
70}
71
72impl SpaceFactor {
73    fn adjust(&mut self, c: char, codes: &SpaceFactorCodes) {
74        // TeX.2021.1034
75        let new: i32 = codes.0.get(c as usize).copied().unwrap_or(1000);
76        if new > 0 && new <= 1000 {
77            self.0 = new;
78        } else if new > 1000 {
79            if self.0 < 1000 {
80                self.0 = 1000
81            } else {
82                self.0 = new
83            }
84        }
85    }
86}
87
88impl TextPreprocessorImpl {
89    pub fn activate_font(&mut self, font: u32) {
90        self.current_font = font;
91    }
92}
93
94impl boxworks::TextPreprocessor for TextPreprocessorImpl {
95    fn new_paragraph(&mut self) {
96        self.space_factor = Default::default();
97    }
98
99    fn add_word(&mut self, word: &str, list: &mut Vec<ds::Horizontal>) {
100        let font = &self.fonts[self.current_font as usize];
101
102        struct Emitter<'a>(&'a mut Vec<ds::Horizontal>, u32);
103        impl<'a> ligkern::Emitter for Emitter<'a> {
104            fn emit_character(&mut self, c: char) {
105                self.0.push(
106                    ds::Char {
107                        char: c,
108                        font: self.1,
109                    }
110                    .into(),
111                );
112                // TeX.2021.1035
113                // TODO: \hyphenchar
114                if c == '-' {
115                    self.0.push(ds::Discretionary::default().into());
116                }
117            }
118            fn emit_kern(&mut self, kern: common::Scaled) {
119                self.0.push(
120                    ds::Kern {
121                        width: kern,
122                        kind: ds::KernKind::Normal,
123                    }
124                    .into(),
125                );
126            }
127            fn emit_ligature(&mut self, ligature: ligkern::Ligature) {
128                let ins_disc = ligature.original.as_ref().ends_with('-');
129                self.0.push(
130                    ds::Ligature {
131                        included_left_boundary: false,
132                        included_right_boundary: false,
133                        char: ligature.c,
134                        font: self.1,
135                        original_chars: ligature.original,
136                    }
137                    .into(),
138                );
139                // TeX.2021.1035
140                // TODO: \hyphenchar
141                if ins_disc {
142                    self.0.push(ds::Discretionary::default().into());
143                }
144            }
145        }
146
147        let mut e = Emitter(list, self.current_font);
148        font.lig_kern_program.run(word, &mut e);
149        // TODO: consider merging this loop with the loop in the lig/kern program.
150        // We can change the run method to accept a callback that is invoked for
151        // each character.
152        for c in word.chars() {
153            self.space_factor.adjust(c, &self.space_factor_codes);
154        }
155    }
156
157    fn add_space(&mut self, list: &mut Vec<ds::Horizontal>) {
158        let mut g = self.fonts[self.current_font as usize].default_space;
159        let g = if self.space_factor == SpaceFactor::default() {
160            // TeX.2021.1041
161            // TODO: implement \spaceskip.
162            g
163        } else {
164            // TeX.2021.1043
165            // TODO: implement "xspace skip" and \spaceskip
166            if self.space_factor.0 >= 2000 {
167                g.width += self.fonts[self.current_font as usize].extra_space;
168            }
169            g.stretch = g.stretch.xn_over_d(self.space_factor.0, 1000).unwrap().0;
170            g.shrink = g.shrink.xn_over_d(1000, self.space_factor.0).unwrap().0;
171            g
172        };
173        list.push(ds::Horizontal::Glue(g.into()));
174    }
175}
176
177impl TextPreprocessorImpl {
178    pub fn register_font(
179        &mut self,
180        id: u32,
181        tfm_file: &tfm::File,
182        lig_kern_program: tfm::ligkern::CompiledProgram,
183    ) {
184        assert_eq!(id as usize, self.fonts.len());
185        self.fonts.push(Font {
186            default_space: common::Glue {
187                width: tfm_file
188                    .named_param_scaled(tfm::NamedParameter::Space)
189                    .unwrap(),
190                stretch: tfm_file
191                    .named_param_scaled(tfm::NamedParameter::Stretch)
192                    .unwrap(),
193                stretch_order: common::GlueOrder::Normal,
194                shrink: tfm_file
195                    .named_param_scaled(tfm::NamedParameter::Shrink)
196                    .unwrap(),
197                shrink_order: common::GlueOrder::Normal,
198            },
199            extra_space: tfm_file
200                .named_param_scaled(tfm::NamedParameter::ExtraSpace)
201                .unwrap(),
202            lig_kern_program,
203        });
204    }
205}
206
207#[derive(Debug, Default)]
208pub struct TfmFontRepo {
209    fonts: HashMap<u32, tfm::File>,
210}
211
212impl TfmFontRepo {
213    pub fn register_font(&mut self, id: u32, tfm_file: tfm::File) {
214        assert_eq!(id as usize, self.fonts.len());
215        self.fonts.insert(id, tfm_file);
216    }
217}
218
219impl boxworks::FontRepo for TfmFontRepo {
220    fn width(&self, c: char, font: u32) -> Option<common::Scaled> {
221        self.fonts[&font].width_utf8(c)
222    }
223    fn height(&self, c: char, font: u32) -> Option<common::Scaled> {
224        self.fonts[&font].height_utf8(c)
225    }
226    fn depth(&self, c: char, font: u32) -> Option<common::Scaled> {
227        self.fonts[&font].depth_utf8(c)
228    }
229}
230
// Tests that run the preprocessor on short inputs and compare the resulting
// horizontal list with one written in the Boxworks language.
#[cfg(test)]
mod tests {
    use super::*;
    use boxworks::lang as bwl;
    use boxworks::TextPreprocessor;

    // Generates a module named `$module` containing one test per
    // `(name, input, want)` triple. Each test runs the preprocessor on the
    // input using the TFM bytes in the constant `$tfm_const`.
    macro_rules! preprocessor_tests {
        ( $module:ident, $tfm_const:ident, $( ( $test_name:ident, $input:expr, $want:expr, ), )+ ) => {
            mod $module {
                $(
                    #[test]
                    fn $test_name() {
                        super::run_preprocessor_test(super::$tfm_const, $input, $want);
                    }
                )+
            }
        };
    }

    const TFM_CMR10: &[u8] = include_bytes!("../../tfm/corpus/computer-modern/cmr10.tfm");

    preprocessor_tests!(
        cmr10,
        TFM_CMR10,
        (
            basic,
            "second",
            r#"
                chars("second", font=0)
            "#,
        ),
        (
            basic_with_space,
            "sec ond",
            r#"
                chars("sec", font=0)
                glue(3.33333pt, 1.66666pt, 1.11111pt)
                chars("ond", font=0)
            "#,
        ),
        (
            kern_ao,
            "AO",
            r#"
                chars("A", font=0)
                kern(-0.27779pt)
                chars("O", font=0)
            "#,
        ),
        (
            kern_av,
            "AV",
            r#"
                chars("A", font=0)
                kern(-1.11113pt)
                chars("V", font=0)
            "#,
        ),
        (
            ligature_1,
            "ff",
            r#"
                lig("\u{b}", "ff", font=0)
            "#,
        ),
        (
            ligature_2,
            "ffi",
            r#"
                lig("\u{e}", "ffi", font=0)
            "#,
        ),
    );

    // Generates the `spacing` module with one test per `(name, word, glue)`
    // triple. Each test typesets "<word> a" in cmr10 and expects the given
    // glue between the word and the trailing "a".
    macro_rules! spacing_tests {
        ( $( ( $test_name:ident, $word:expr, $glue:expr, ), )+ ) => {
            mod spacing {
                $(
                    #[test]
                    fn $test_name() {
                        let input = format!("{} a", $word);
                        let want = format!(r#"
                            chars("{}", font=0)
                            {}
                            chars("a", font=0)
                        "#, $word, $glue);
                        super::run_preprocessor_test(super::TFM_CMR10, &input, &want);
                    }
                )+
            }
        };
    }

    spacing_tests!(
        // These tests check the default space factors in plain.tex.
        (default_1, "a;", "glue(3.33333pt, 2.49998pt, 0.74074pt)",),
        (default_2, "a,", "glue(3.33333pt, 2.08331pt, 0.88889pt)",),
        (default_3, "a.", "glue(4.44444pt, 4.99997pt, 0.37036pt)",),
        (default_4, "a:", "glue(4.44444pt, 3.33331pt, 0.55556pt)",),
        // The next tests are for `SpaceFactor::adjust`.
        // The SF is adjusted based on both its current value and the SF
        // code of the next character. We test the 16 possible cases where
        // current and next are in the following 4 classes: zero, small
        // (less than 1000), normal (1000), large (greater than 1000).
        (
            adjust_space_factor_zero_zero,
            "))",
            "glue(3.33333pt, 1.66666pt, 1.11111pt)",
        ),
        (
            adjust_space_factor_zero_small,
            ")A",
            "glue(3.33333pt, 1.66498pt, 1.11221pt)",
        ),
        (
            adjust_space_factor_zero_normal,
            ")a",
            "glue(3.33333pt, 1.66666pt, 1.11111pt)",
        ),
        (
            adjust_space_factor_zero_large,
            ").",
            "glue(4.44444pt, 4.99997pt, 0.37036pt)",
        ),
        (
            adjust_space_factor_small_zero,
            "A)",
            "glue(3.33333pt, 1.66498pt, 1.11221pt)",
        ),
        (
            adjust_space_factor_small_small,
            "AA",
            "glue(3.33333pt, 1.66498pt, 1.11221pt)",
        ),
        (
            adjust_space_factor_small_normal,
            "Aa",
            "glue(3.33333pt, 1.66666pt, 1.11111pt)",
        ),
        (
            adjust_space_factor_small_large,
            "A.",
            "glue(3.33333pt, 1.66666pt, 1.11111pt)",
        ),
        (
            adjust_space_factor_normal_zero,
            "a)",
            "glue(3.33333pt, 1.66666pt, 1.11111pt)",
        ),
        (
            adjust_space_factor_normal_small,
            "aA",
            "glue(3.33333pt, 1.66498pt, 1.11221pt)",
        ),
        (
            adjust_space_factor_normal_normal,
            "aa",
            "glue(3.33333pt, 1.66666pt, 1.11111pt)",
        ),
        (
            adjust_space_factor_normal_large,
            "a.",
            "glue(4.44444pt, 4.99997pt, 0.37036pt)",
        ),
        (
            adjust_space_factor_large_zero,
            ".)",
            "glue(4.44444pt, 4.99997pt, 0.37036pt)",
        ),
        (
            adjust_space_factor_large_small,
            ".A",
            "glue(3.33333pt, 1.66498pt, 1.11221pt)",
        ),
        (
            adjust_space_factor_large_normal,
            ".a",
            "glue(3.33333pt, 1.66666pt, 1.11111pt)",
        ),
        (
            adjust_space_factor_large_large,
            "..",
            "glue(4.44444pt, 4.99997pt, 0.37036pt)",
        ),
    );

    const TFM_SMFEBSL: &[u8] = include_bytes!("../../tfm/corpus/ctan/smfebsl10-3.tfm");

    preprocessor_tests!(
        smfebsl,
        TFM_SMFEBSL,
        (
            basic_with_space,
            "sec ond",
            r#"
                chars("sec", font=0)
                glue(4.78204pt, 2.39102pt, 1.19551pt)
                chars("on", font=0)
                kern(-0.49814pt)
                chars("d", font=0)
            "#,
        ),
        (
            numbers_start_of_word,
            "123B",
            r##"
                lig("$", "|", font=0)
                chars("123", font=0)
                lig("#", "", font=0)
                chars("B", font=0)
            "##,
        ),
        (
            numbers_mid_word,
            "A123B",
            r##"
                chars("A", font=0)
                lig("$", "", font=0)
                chars("123", font=0)
                lig("#", "", font=0)
                chars("B", font=0)
            "##,
        ),
        /*
        TODO: right boundary char
        (
            numbers_end_of_word,
            "A123",
            r##"
                chars("A", font=0)
                lig("$", "", font=0)
                chars("123", font=0)
                lig("#", "|", font=0)
            "##,
        ),
        */
    );

    // Registers the font in `tfm_bytes` as font 0, feeds `input` to the
    // preprocessor word by word (adding a space after every word that is
    // followed by one) and asserts that the resulting list equals the list
    // described by `want`.
    fn run_preprocessor_test(tfm_bytes: &[u8], input: &str, want: &str) {
        let mut tfm_file = tfm::File::deserialize(tfm_bytes).0.unwrap();
        let lig_kern_program =
            tfm::ligkern::CompiledProgram::compile_from_tfm_file(&mut tfm_file).0;

        let want = bwl::parse_horizontal_list(want).unwrap();

        let mut tp = TextPreprocessorImpl::default();
        tp.register_font(0, &tfm_file, lig_kern_program);
        tp.activate_font(0);

        let mut got = Vec::new();
        // Every piece is a word followed by at most one trailing space.
        for piece in input.split_inclusive(' ') {
            tp.add_word(piece.trim_matches(' '), &mut got);
            if piece.ends_with(' ') {
                tp.add_space(&mut got);
            }
        }

        assert_eq!(got, want);
    }
}