boxworks_hyphenate/
lib.rs

1use boxworks::ds;
2
3#[derive(Default)]
4pub struct Hyphenator {
5    pub hyphenator: hyphenate::Hyphenator,
6    pub left_hyphen_min: i32,
7    pub right_hyphen_min: i32,
8}
9
10impl Hyphenator {
11    pub fn plain_tex_en_us() -> Self {
12        Self {
13            hyphenator: hyphenate::Hyphenator::plain_tex_en_us(),
14            left_hyphen_min: 2,
15            right_hyphen_min: 3,
16        }
17    }
18}
19
20impl boxworks::Hyphenator for Hyphenator {
21    fn hyphenate(&self, list: &mut Vec<ds::Horizontal>) {
22        let out = hyphenate_impl(self, list);
23        *list = out;
24    }
25}
26
27fn hyphenate_impl(hyphenater: &Hyphenator, list: &[ds::Horizontal]) -> Vec<ds::Horizontal> {
28    let lower_caser = hyphenate::AsciiLowerCaser::default();
29    let mut out = vec![];
30    let mut i = 0;
31    while let Some(elem) = list.get(i) {
32        i += 1;
33        out.push(elem.clone());
34        // Hyphenation starts only at glue nodes, as per TeX.2021.866.
35        if !matches!(elem, ds::Horizontal::Glue(_)) {
36            continue;
37        }
38
39        // TODO: implement \lefthyphenmin and \righthyphenmin,
40        // we assume they are 2 and 3 in the hyphenate package
41        // TODO: hyf_char needs to be read from somewhere using the font numbers
42        // in char and lig nodes.
43        let hyf_char = 3_i32;
44
45        // Find the place to start hyphenating
46        // TeX.2021.896
47        let hyphenation_font: Option<u32> = loop {
48            let Some(elem) = list.get(i) else { break None };
49            enum Action {
50                Start { font: u32 },
51                Continue,
52                // Equivalent to done1 in Knuth's TeX.
53                Abort,
54            }
55            use ds::Horizontal::*;
56            let action = match elem {
57                Char(char) => {
58                    // The hyf_char logic runs in the label done2.
59                    // TODO: this code assumes \uchyph=true (uppercase letters are hyphenated)
60                    // and that \lccode has its PlainTeX values (has lower case character iff
61                    // ASCII alphabetic). We should remove these assumptions by plumbing in some
62                    // parameters.
63                    if char.char.is_ascii_alphabetic() {
64                        if hyf_char > 0 && hyf_char <= 255 {
65                            Action::Start { font: char.font }
66                        } else {
67                            Action::Abort
68                        }
69                    } else {
70                        Action::Continue
71                    }
72                }
73                Ligature(ligature) => {
74                    match ligature.original_chars.chars().next() {
75                        None => Action::Continue,
76                        Some(char) => {
77                            // TODO: all of the TODOs in the Char arm.
78                            if char.is_ascii_alphabetic() {
79                                if hyf_char > 0 && hyf_char <= 255 {
80                                    Action::Start {
81                                        font: ligature.font,
82                                    }
83                                } else {
84                                    Action::Abort
85                                }
86                            } else {
87                                Action::Continue
88                            }
89                        }
90                    }
91                }
92                Whatsit(whatsit) => {
93                    // TeX.2021.1363 but given how we've architected the code, the logic in TeX.2021.1382
94                    // (which changes the current language) should run here.
95                    whatsit.hyphenation_hook();
96                    Action::Continue
97                }
98                Kern(kern) => match kern.kind {
99                    ds::KernKind::Normal => Action::Continue,
100                    _ => Action::Abort,
101                },
102                _ => Action::Abort,
103            };
104            match action {
105                Action::Start { font } => {
106                    break Some(font);
107                }
108                Action::Continue => {
109                    i += 1;
110                    out.push(elem.clone());
111                }
112                Action::Abort => {
113                    i += 1;
114                    out.push(elem.clone());
115                    break None;
116                }
117            }
118        };
119        let Some(hyphenation_font) = hyphenation_font else {
120            continue;
121        };
122        // It's still possible we won't hyphenate based on the node that ends the
123        // string of characters. So we save this `i` value here; if hyphenation is skipped
124        // we set `i` back to this.
125        let hyphenation_start_i = i;
126
127        let mut s = String::new();
128        // Accumulate the word to be hyphenated.
129        // TeX.2021.897
130        while let Some(elem) = list.get(i) {
131            use ds::Horizontal::*;
132            match elem {
133                Char(char) => {
134                    if char.font != hyphenation_font {
135                        break;
136                    }
137                    // TODO: plumb in \lccode and change this check.
138                    if !char.char.is_ascii_alphabetic() {
139                        break;
140                    }
141                    if s.len() + char.char.len_utf8() >= 64 {
142                        // TeX only hyphenates words up to 64 bytes.
143                        // TODO: this check is not quite right: unicode values in the range [128, 255)
144                        // should count as 1 only.
145                        break;
146                    }
147                    s.push(char.char);
148                }
149                Ligature(ligature) => {
150                    // TeX.2021.898
151                    if ligature.font != hyphenation_font {
152                        break;
153                    }
154                    if !ligature
155                        .original_chars
156                        .chars()
157                        .all(|c| c.is_ascii_alphabetic())
158                    {
159                        break;
160                    }
161                    if s.len() + ligature.original_chars.len() >= 64 {
162                        // TeX only hyphenates words up to 64 bytes.
163                        // TODO: this check is not quite right: unicode values in the range [128, 255)
164                        // should count as 1 only.
165                        break;
166                    }
167                    s.push_str(&ligature.original_chars);
168                }
169                Kern(kern) => match kern.kind {
170                    ds::KernKind::Normal => {
171                        // TODO: set up the lig/kern program correctly.
172                    }
173                    _ => break,
174                },
175                _ => break,
176            }
177            // Consume the node whose characters have just been placed in s (or the normal kern).
178            i += 1;
179        }
180        // The first char node that triggered the word search will have been put in s.
181        assert!(!s.is_empty());
182
183        // Check if the word can be hyphenated based on the terminating node.
184        // TeX.2021.899
185        // We use a different index to iterate as all of the elements here still need to be
186        // copied to the output list.
187        let mut j = i;
188        let should_hyphenate = loop {
189            let Some(elem) = list.get(j) else { break true };
190            use ds::Horizontal::*;
191            match elem {
192                Char(_) | Ligature(_) => {
193                    // This can happen if the font is different, or the 64 byte limit is already
194                    // reached.
195                }
196                Kern(kern) => match kern.kind {
197                    ds::KernKind::Normal => {
198                        // TODO: set up the lig/kern program correctly.
199                    }
200                    _ => break true,
201                },
202                Whatsit(_) | Glue(_) | Penalty(_) | Insertion(_) | Adjust(_) | Mark(_) => {
203                    break true;
204                }
205                HBox(_) | VBox(_) | Rule(_) | Discretionary(_) | Math(_) => {
206                    // done1 in Knuth's TeX.f
207                    break false;
208                }
209            }
210            j += 1;
211        };
212        if !should_hyphenate {
213            i = hyphenation_start_i;
214            continue;
215        }
216
217        let l = s.chars().count();
218        // TeX.2021.1200
219        let left_hyphen_min = if hyphenater.left_hyphen_min < 1 {
220            1
221        } else {
222            hyphenater.left_hyphen_min as usize
223        };
224        let right_hyphen_min = if hyphenater.right_hyphen_min < 1 {
225            1
226        } else {
227            hyphenater.right_hyphen_min as usize
228        };
229
230        let mut indices = hyphenater.hyphenator.calculate_indices(&lower_caser, &s);
231        let mut next_or = indices.next();
232
233        let mut j = hyphenation_start_i;
234        let mut chars_pushed = 0;
235        while j < i {
236            if let Some(next) = next_or {
237                if next == chars_pushed
238                    && next >= left_hyphen_min
239                    && next <= l.saturating_sub(right_hyphen_min)
240                {
241                    let mut replace_count = 0_u32;
242                    while j + (replace_count as usize) < i
243                        && matches!(&list[j + (replace_count as usize)], ds::Horizontal::Kern(_))
244                    {
245                        replace_count += 1;
246                    }
247                    let mut pre_break = vec![];
248                    let put_back = if replace_count > 0 {
249                        let last_char = out.pop().expect("char character was just written");
250                        pre_break.push(
251                            last_char
252                                .clone()
253                                .try_into()
254                                .expect("this must be a char item"),
255                        );
256                        replace_count += 1;
257                        Some(last_char)
258                    } else {
259                        None
260                    };
261                    pre_break.push(ds::DiscretionaryElem::Char(ds::Char {
262                        char: '-',
263                        font: hyphenation_font,
264                    }));
265                    out.push(ds::Horizontal::Discretionary(ds::Discretionary {
266                        pre_break,
267                        post_break: vec![],
268                        replace_count,
269                    }));
270                    if let Some(put_back) = put_back {
271                        out.push(put_back);
272                    }
273                }
274                if next <= chars_pushed {
275                    next_or = indices.next();
276                }
277            }
278            let elem = &list[j];
279            out.push(elem.clone());
280            j += 1;
281            use ds::Horizontal::*;
282            chars_pushed += match elem {
283                Char(_) => 1,
284                Ligature(ligature) => ligature.original_chars.chars().count(),
285                Kern(kern) => match kern.kind {
286                    ds::KernKind::Normal => 0,
287                    _ => panic!("unexpected node in hyphenated word"),
288                },
289                _ => panic!("unexpected node in hyphenated word"),
290            };
291        }
292    }
293    out
294}
295
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::PathBuf;

    use super::*;
    use boxworks::TextPreprocessor;
    use boxworks_testing::assert_box_eq;
    use boxworks_text as bwt;

    /// Bytes of the cmr10 .tfm file; every test swaps in its own lig/kern program.
    const TFM_CMR10: &[u8] = include_bytes!("../../tfm/corpus/computer-modern/cmr10.tfm");

    /// Runs one hyphenation scenario.
    ///
    /// * `input` - the word, with `-` marking each permitted hyphenation point.
    /// * `lig_kern_program` - compact lig/kern program that replaces the one in cmr10.
    /// * `want` - expected horizontal list for the word, in the `assert_box_eq!` syntax.
    ///
    /// The expectation is only checked against the `tex` binary, and only when the
    /// environment variable `TEXCRAFT_VERIFY` is set to `tex`. The in-process check of this
    /// crate's hyphenator is currently commented out, so without that variable the test
    /// only exercises the font parsing.
    fn run_hyphenation_test(input: &str, lig_kern_program: &str, want: &str) {
        let unhyphenated = input.replace('-', "");

        // TeX does not hyphenate the first word of a paragraph so we need to put
        // another word before the word of interest
        let tex_input = format!("x {unhyphenated}");

        let mut tfm_file = tfm::File::deserialize(TFM_CMR10).0.unwrap();
        {
            let (lk_program, lk_extra) =
                tfm::ligkern::lang::Program::parse_compact(lig_kern_program).unwrap();
            tfm_file.replace_lig_kern_program(lk_program, lk_extra);
        }

        if matches!(std::env::var("TEXCRAFT_VERIFY").as_deref(), Ok("tex")) {
            let auxiliary_files: HashMap<PathBuf, Vec<u8>> =
                HashMap::from([(PathBuf::from("specialFont.tfm"), tfm_file.serialize())]);

            let preamble = format!(
                r"
                    \font \customFont specialFont
                    
                    \hyphenation{{{input}}}
                    \lefthyphenmin=0
                    \righthyphenmin=0
                    
                    \customFont
                "
            );
            let tex_engine = boxworks::tex::new_tex_engine_binary("tex".to_string()).unwrap();
            let (_, tex_got) = boxworks::tex::build_horizontal_lists(
                tex_engine.as_ref(),
                &auxiliary_files,
                &preamble,
                &mut vec![tex_input.clone()].iter(),
                /*hyphenate=*/ true,
            );
            // Drop the first two nodes of the list (the leading helper word "x" is not
            // part of the expectation).
            let tex_got: Vec<boxworks::ds::Horizontal> =
                tex_got[0].list[2..].iter().cloned().collect();

            assert_box_eq!(want, tex_got);
        }

        // Disabled: the same expectation checked against this crate's own hyphenator.
        /*
        let lig_kern_program =
            tfm::ligkern::CompiledProgram::compile_from_tfm_file(&mut tfm_file).0;
        let mut tp: bwt::TextPreprocessorImpl = Default::default();
        tp.register_font(0, &tfm_file, lig_kern_program);
        tp.activate_font(0);
        let mut list = vec![];
        for word in tex_input.split_ascii_whitespace() {
            tp.add_word(word.trim_matches(' '), &mut list);
            tp.add_space(&mut list);
        }
        list.pop();

        let mut font_repo: bwt::TfmFontRepo = Default::default();
        font_repo.register_font(0, tfm_file);

        let mut hyphenator = Hyphenator::plain_tex_en_us();
        hyphenator.hyphenator.insert_exception(input);
        hyphenator.left_hyphen_min = 1;
        hyphenator.right_hyphen_min = 1;
        {
            use boxworks::Hyphenator;
            hyphenator.hyphenate(&mut list)
        }

        let tex_got: Vec<boxworks::ds::Horizontal> = list[2..].iter().cloned().collect();
        assert_box_eq!(want, tex_got);
        */
    }

    /// Expands each `{ name, input, lig_kern_program, want }` entry into a `#[test]`
    /// function called `name` that forwards to `run_hyphenation_test`.
    macro_rules! hyphenation_tests {
        ( $( { $name:ident, $input:expr, $lig_kern_program:expr, $want:expr }, )* ) => {
            $(
                #[test]
                fn $name() {
                    run_hyphenation_test($input, $lig_kern_program, $want);
                }
            )*
        };
    }

    hyphenation_tests![
        {
            most_simple_case, "a-b", "",
            r#"
                chars("a")
                disc(
                  pre_break=[
                    chars("-")
                  ],
                )
                chars("b")
            "#
        },
        {
            lig_with_hyphen, "a-b",
            "
                a- -> ax-^
            ",
            r#"
                disc(
                  pre_break=[
                    chars("ax")
                    chars("-")
                  ],
                  replace_count=1,
                )
                chars("a")
                chars("b")
            "#
        },
        {
            lig_with_hyphen_and_letters, "a-b",
            "
                a- -> ax-^
                ab -> ac^_
            ",
            r#"
                disc(
                  pre_break=[
                    chars("ax")
                    chars("-")
                  ],
                  post_break=[
                    chars("b")
                  ],
                  replace_count=2,
                )
                chars("a")
                lig("c", "b")
            "#
        },
        {
            left_boundary_char, "a-b",
            "
                |b -> |c^_
            ",
            r#"
                chars("a")
                disc(
                  pre_break=[
                    chars("-")
                  ],
                  post_break=[
                    chars("c")
                  ],
                  replace_count=1,
                )
                chars("b")
            "#
        },
        {
            right_boundary_char, "a-b",
            "
                -| -> -c^|
            ",
            r#"
                chars("a")
                disc(
                  pre_break=[
                    chars("-c")
                  ],
                )
                chars("b")
            "#
        },
        {
            big_lig_1, "a-bc",
            "
                ab -> _x^_
                xc -> _y^_
            ",
            r#"
                disc(
                  pre_break=[
                    chars("a")
                    chars("-")
                  ],
                  post_break=[
                    chars("b")
                    chars("c")
                  ],
                  replace_count=1,
                )
                lig("y", "abc")
            "#
        },
        {
            big_lig_2, "a-bc",
            "
                ab -> _x^_
                xc -> _y^_
                bc -> _z^_
            ",
            r#"
                disc(
                  pre_break=[
                    chars("a")
                    chars("-")
                  ],
                  post_break=[
                    chars("z")
                  ],
                  replace_count=1,
                )
                lig("y", "abc")
            "#
        },
        {
            big_lig_3, "ab-c",
            "
                ab -> _x^_
                xc -> _y^_
            ",
            r#"
                disc(
                  pre_break=[
                    chars("x")
                    chars("-")
                  ],
                  post_break=[
                    chars("c")
                  ],
                  replace_count=1,
                )
                lig("y", "abc")
            "#
        },
        {
            big_lig_with_hyphen, "ab-c",
            "
                ab -> ax^_
                x- -> xy^-
            ",
            r#"
                disc(
                  pre_break=[
                    chars("axy")
                    chars("-")
                  ],
                  replace_count=2,
                )
                chars("a")
                lig("x", "b")
                chars("c")
            "#
        },
        {
            empty_lig_before, "a-b",
            "
                ab -> ax^b
            ",
            r#"
                disc(
                  pre_break=[
                    chars("a")
                    chars("-")
                  ],
                  replace_count=2,
                )
                chars("a")
                lig("x", "")
                chars("b")
            "#
        },
        {
            same_kern, "a-b",
            "
                ab -> a[100]b
                a- -> a[100]-
            ",
            r#"
                disc(
                  pre_break=[
                    chars("a")
                    kern(0.00095pt)
                    chars("-")
                  ],
                  replace_count=2,
                )
                chars("a")
                kern(0.00095pt)
                chars("b")
            "#
        },
        {
            synchronization_1, "a-bcdefgh",
            "
                ab -> _x^_
                bc -> _y^_
                cd -> _z^_
                de -> _w^_
                ef -> _v^_
            ",
            r#"
                disc(
                  pre_break=[
                    chars("a-")
                  ],
                  post_break=[
                    chars("ywf")
                  ],
                  replace_count=3,
                )
                lig("x", "ab")
                lig("z", "cd")
                lig("v", "ef")
                # synchronization point
                chars("gh", font=0)
            "#
        },
        {
            synchronization_2, "a-bcd-ef-gh",
            "
                ab -> _x^_
                bc -> _y^_
                cd -> _z^_
                de -> _w^_
                ef -> _v^_
            ",
            r#"
                disc(
                  pre_break=[
                    chars("a-")
                  ],
                  post_break=[
                    chars("ywf")
                  ],
                  replace_count=3,
                )
                lig("x", "ab")
                lig("z", "cd")
                # the hyphen here is skipped
                lig("v", "ef")
                # synchronization point
                disc(
                  pre_break=[
                    chars("-")
                  ],
                  post_break=[
                  ],
                )
                chars("gh", font=0)
            "#
        },
        {
            synchronization_4, "a-bcde",
            "
                ab -> _x^_
                bc -> _y^_
                xc -> _y^_
                yd -> yzd^
            ",
            r#"
                disc(
                  pre_break=[
                    chars("a-")
                  ],
                  post_break=[
                    # these are duplicated from the main list.
                    # we could have nothing here and replace_count=0
                    chars("yz")
                  ],
                  replace_count=2,
                )
                lig("y", "abc")
                lig("z", "")
                chars("de", font=0)
            "#
        },
    ];
}