boxworks_hyphenate/lib.rs
1use boxworks::ds;
2
3#[derive(Default)]
4pub struct Hyphenator {
5 pub hyphenator: hyphenate::Hyphenator,
6 pub left_hyphen_min: i32,
7 pub right_hyphen_min: i32,
8}
9
10impl Hyphenator {
11 pub fn plain_tex_en_us() -> Self {
12 Self {
13 hyphenator: hyphenate::Hyphenator::plain_tex_en_us(),
14 left_hyphen_min: 2,
15 right_hyphen_min: 3,
16 }
17 }
18}
19
20impl boxworks::Hyphenator for Hyphenator {
21 fn hyphenate(&self, list: &mut Vec<ds::Horizontal>) {
22 let out = hyphenate_impl(self, list);
23 *list = out;
24 }
25}
26
27fn hyphenate_impl(hyphenater: &Hyphenator, list: &[ds::Horizontal]) -> Vec<ds::Horizontal> {
28 let lower_caser = hyphenate::AsciiLowerCaser::default();
29 let mut out = vec![];
30 let mut i = 0;
31 while let Some(elem) = list.get(i) {
32 i += 1;
33 out.push(elem.clone());
34 // Hyphenation starts only at glue nodes, as per TeX.2021.866.
35 if !matches!(elem, ds::Horizontal::Glue(_)) {
36 continue;
37 }
38
39 // TODO: implement \lefthyphenmin and \righthyphenmin,
40 // we assume they are 2 and 3 in the hyphenate package
41 // TODO: hyf_char needs to be read from somewhere using the font numbers
42 // in char and lig nodes.
43 let hyf_char = 3_i32;
44
45 // Find the place to start hyphenating
46 // TeX.2021.896
47 let hyphenation_font: Option<u32> = loop {
48 let Some(elem) = list.get(i) else { break None };
49 enum Action {
50 Start { font: u32 },
51 Continue,
52 // Equivalent to done1 in Knuth's TeX.
53 Abort,
54 }
55 use ds::Horizontal::*;
56 let action = match elem {
57 Char(char) => {
58 // The hyf_char logic runs in the label done2.
59 // TODO: this code assumes \uchyph=true (uppercase letters are hyphenated)
60 // and that \lccode has its PlainTeX values (has lower case character iff
61 // ASCII alphabetic). We should remove these assumptions by plumbing in some
62 // parameters.
63 if char.char.is_ascii_alphabetic() {
64 if hyf_char > 0 && hyf_char <= 255 {
65 Action::Start { font: char.font }
66 } else {
67 Action::Abort
68 }
69 } else {
70 Action::Continue
71 }
72 }
73 Ligature(ligature) => {
74 match ligature.original_chars.chars().next() {
75 None => Action::Continue,
76 Some(char) => {
77 // TODO: all of the TODOs in the Char arm.
78 if char.is_ascii_alphabetic() {
79 if hyf_char > 0 && hyf_char <= 255 {
80 Action::Start {
81 font: ligature.font,
82 }
83 } else {
84 Action::Abort
85 }
86 } else {
87 Action::Continue
88 }
89 }
90 }
91 }
92 Whatsit(whatsit) => {
93 // TeX.2021.1363 but given how we've architected the code, the logic in TeX.2021.1382
94 // (which changes the current language) should run here.
95 whatsit.hyphenation_hook();
96 Action::Continue
97 }
98 Kern(kern) => match kern.kind {
99 ds::KernKind::Normal => Action::Continue,
100 _ => Action::Abort,
101 },
102 _ => Action::Abort,
103 };
104 match action {
105 Action::Start { font } => {
106 break Some(font);
107 }
108 Action::Continue => {
109 i += 1;
110 out.push(elem.clone());
111 }
112 Action::Abort => {
113 i += 1;
114 out.push(elem.clone());
115 break None;
116 }
117 }
118 };
119 let Some(hyphenation_font) = hyphenation_font else {
120 continue;
121 };
122 // It's still possible we won't hyphenate based on the node that ends the
123 // string of characters. So we save this `i` value here; if hyphenation is skipped
124 // we set `i` back to this.
125 let hyphenation_start_i = i;
126
127 let mut s = String::new();
128 // Accumulate the word to be hyphenated.
129 // TeX.2021.897
130 while let Some(elem) = list.get(i) {
131 use ds::Horizontal::*;
132 match elem {
133 Char(char) => {
134 if char.font != hyphenation_font {
135 break;
136 }
137 // TODO: plumb in \lccode and change this check.
138 if !char.char.is_ascii_alphabetic() {
139 break;
140 }
141 if s.len() + char.char.len_utf8() >= 64 {
142 // TeX only hyphenates words up to 64 bytes.
143 // TODO: this check is not quite right: unicode values in the range [128, 255)
144 // should count as 1 only.
145 break;
146 }
147 s.push(char.char);
148 }
149 Ligature(ligature) => {
150 // TeX.2021.898
151 if ligature.font != hyphenation_font {
152 break;
153 }
154 if !ligature
155 .original_chars
156 .chars()
157 .all(|c| c.is_ascii_alphabetic())
158 {
159 break;
160 }
161 if s.len() + ligature.original_chars.len() >= 64 {
162 // TeX only hyphenates words up to 64 bytes.
163 // TODO: this check is not quite right: unicode values in the range [128, 255)
164 // should count as 1 only.
165 break;
166 }
167 s.push_str(&ligature.original_chars);
168 }
169 Kern(kern) => match kern.kind {
170 ds::KernKind::Normal => {
171 // TODO: set up the lig/kern program correctly.
172 }
173 _ => break,
174 },
175 _ => break,
176 }
177 // Consume the node whose characters have just been placed in s (or the normal kern).
178 i += 1;
179 }
180 // The first char node that triggered the word search will have been put in s.
181 assert!(!s.is_empty());
182
183 // Check if the word can be hyphenated based on the terminating node.
184 // TeX.2021.899
185 // We use a different index to iterate as all of the elements here still need to be
186 // copied to the output list.
187 let mut j = i;
188 let should_hyphenate = loop {
189 let Some(elem) = list.get(j) else { break true };
190 use ds::Horizontal::*;
191 match elem {
192 Char(_) | Ligature(_) => {
193 // This can happen if the font is different, or the 64 byte limit is already
194 // reached.
195 }
196 Kern(kern) => match kern.kind {
197 ds::KernKind::Normal => {
198 // TODO: set up the lig/kern program correctly.
199 }
200 _ => break true,
201 },
202 Whatsit(_) | Glue(_) | Penalty(_) | Insertion(_) | Adjust(_) | Mark(_) => {
203 break true;
204 }
205 HBox(_) | VBox(_) | Rule(_) | Discretionary(_) | Math(_) => {
206 // done1 in Knuth's TeX.f
207 break false;
208 }
209 }
210 j += 1;
211 };
212 if !should_hyphenate {
213 i = hyphenation_start_i;
214 continue;
215 }
216
217 let l = s.chars().count();
218 // TeX.2021.1200
219 let left_hyphen_min = if hyphenater.left_hyphen_min < 1 {
220 1
221 } else {
222 hyphenater.left_hyphen_min as usize
223 };
224 let right_hyphen_min = if hyphenater.right_hyphen_min < 1 {
225 1
226 } else {
227 hyphenater.right_hyphen_min as usize
228 };
229
230 let mut indices = hyphenater.hyphenator.calculate_indices(&lower_caser, &s);
231 let mut next_or = indices.next();
232
233 let mut j = hyphenation_start_i;
234 let mut chars_pushed = 0;
235 while j < i {
236 if let Some(next) = next_or {
237 if next == chars_pushed
238 && next >= left_hyphen_min
239 && next <= l.saturating_sub(right_hyphen_min)
240 {
241 let mut replace_count = 0_u32;
242 while j + (replace_count as usize) < i
243 && matches!(&list[j + (replace_count as usize)], ds::Horizontal::Kern(_))
244 {
245 replace_count += 1;
246 }
247 let mut pre_break = vec![];
248 let put_back = if replace_count > 0 {
249 let last_char = out.pop().expect("char character was just written");
250 pre_break.push(
251 last_char
252 .clone()
253 .try_into()
254 .expect("this must be a char item"),
255 );
256 replace_count += 1;
257 Some(last_char)
258 } else {
259 None
260 };
261 pre_break.push(ds::DiscretionaryElem::Char(ds::Char {
262 char: '-',
263 font: hyphenation_font,
264 }));
265 out.push(ds::Horizontal::Discretionary(ds::Discretionary {
266 pre_break,
267 post_break: vec![],
268 replace_count,
269 }));
270 if let Some(put_back) = put_back {
271 out.push(put_back);
272 }
273 }
274 if next <= chars_pushed {
275 next_or = indices.next();
276 }
277 }
278 let elem = &list[j];
279 out.push(elem.clone());
280 j += 1;
281 use ds::Horizontal::*;
282 chars_pushed += match elem {
283 Char(_) => 1,
284 Ligature(ligature) => ligature.original_chars.chars().count(),
285 Kern(kern) => match kern.kind {
286 ds::KernKind::Normal => 0,
287 _ => panic!("unexpected node in hyphenated word"),
288 },
289 _ => panic!("unexpected node in hyphenated word"),
290 };
291 }
292 }
293 out
294}
295
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::PathBuf;

    use super::*;
    use boxworks::TextPreprocessor;
    use boxworks_testing::assert_box_eq;
    use boxworks_text as bwt;

    /// The Computer Modern Roman 10pt font metrics; each test swaps in its own
    /// lig/kern program.
    const TFM_CMR10: &[u8] = include_bytes!("../../tfm/corpus/computer-modern/cmr10.tfm");

    /// Runs one hyphenation test case.
    ///
    /// * `input` is the word under test, with `-` marking the hyphenation points.
    /// * `lig_kern_program` is a lig/kern program in the compact syntax accepted
    ///   by `tfm::ligkern::lang::Program::parse_compact`; it replaces the program
    ///   of cmr10.
    /// * `want` is the expected horizontal list in the box language.
    ///
    /// The expectation is only checked against a real TeX binary, and only when
    /// the environment variable `TEXCRAFT_VERIFY` is set to `tex`. The check
    /// against this crate's own hyphenator is currently commented out, so without
    /// that variable the test only verifies that the font and the lig/kern
    /// program can be parsed.
    fn run_hyphenation_test(input: &str, lig_kern_program: &str, want: &str) {
        let unhyphenated: String = input.chars().filter(|c| *c != '-').collect();

        // TeX does not hyphenate the first word of a paragraph so we need to put
        // another word before the word of interest
        let tex_input = format!("x {unhyphenated}");

        let mut tfm_file = tfm::File::deserialize(TFM_CMR10).0.unwrap();
        {
            let (p, e) = tfm::ligkern::lang::Program::parse_compact(lig_kern_program).unwrap();
            tfm_file.replace_lig_kern_program(p, e);
        }

        // An unset (or non-unicode) variable simply disables the TeX check.
        if matches!(std::env::var("TEXCRAFT_VERIFY").as_deref(), Ok("tex")) {
            let tfm_bytes = tfm_file.serialize();
            let mut auxiliary_files: HashMap<PathBuf, Vec<u8>> = Default::default();
            auxiliary_files.insert("specialFont.tfm".into(), tfm_bytes);

            // The word is registered as a hyphenation exception so that TeX breaks
            // it exactly at the `-` markers of `input`.
            let preamble = format!(
                r"
                \font \customFont specialFont

                \hyphenation{{{input}}}
                \lefthyphenmin=0
                \righthyphenmin=0

                \customFont
                "
            );
            let tex_engine = boxworks::tex::new_tex_engine_binary("tex".to_string()).unwrap();
            let (_, tex_got) = boxworks::tex::build_horizontal_lists(
                tex_engine.as_ref(),
                &auxiliary_files,
                &preamble,
                &mut [tex_input.clone()].iter(),
                /*hyphenate=*/ true,
            );
            // Skip the first two nodes of the list (presumably the leading "x"
            // and the glue after it), keeping only the word under test.
            let tex_got: Vec<boxworks::ds::Horizontal> = tex_got[0].list[2..].to_vec();

            assert_box_eq!(want, tex_got);
        }

        /*
        let lig_kern_program =
            tfm::ligkern::CompiledProgram::compile_from_tfm_file(&mut tfm_file).0;
        let mut tp: bwt::TextPreprocessorImpl = Default::default();
        tp.register_font(0, &tfm_file, lig_kern_program);
        tp.activate_font(0);
        let mut list = vec![];
        for word in tex_input.split_ascii_whitespace() {
            tp.add_word(word.trim_matches(' '), &mut list);
            tp.add_space(&mut list);
        }
        list.pop();

        let mut font_repo: bwt::TfmFontRepo = Default::default();
        font_repo.register_font(0, tfm_file);

        let mut hyphenator = Hyphenator::plain_tex_en_us();
        hyphenator.hyphenator.insert_exception(input);
        hyphenator.left_hyphen_min = 1;
        hyphenator.right_hyphen_min = 1;
        {
            use boxworks::Hyphenator;
            hyphenator.hyphenate(&mut list)
        }

        let tex_got: Vec<boxworks::ds::Horizontal> = list[2..].iter().cloned().collect();
        assert_box_eq!(want, tex_got);
        */
    }

    // Expands every `{ name, input, lig_kern_program, want }` entry into a
    // `#[test]` function `name` that calls `run_hyphenation_test`.
    macro_rules! hyphenation_tests {
        ( $( { $name:ident, $input:expr, $lig_kern_program:expr, $want:expr }, )* ) => {
            $(
                #[test]
                fn $name() {
                    run_hyphenation_test($input, $lig_kern_program, $want);
                }
            )*
        };
    }

    hyphenation_tests![
        // No lig/kern program at all: a plain discretionary hyphen between the letters.
        {
            most_simple_case, "a-b", "",
            r#"
                chars("a")
                disc(
                    pre_break=[
                        chars("-")
                    ],
                )
                chars("b")
            "#
        },
        // The letter before the break forms a ligature with the hyphen character.
        {
            lig_with_hyphen, "a-b",
            "
                a- -> ax-^
            ",
            r#"
                disc(
                    pre_break=[
                        chars("ax")
                        chars("-")
                    ],
                    replace_count=1,
                )
                chars("a")
                chars("b")
            "#
        },
        // As above, with an additional ligature between the two letters.
        {
            lig_with_hyphen_and_letters, "a-b",
            "
                a- -> ax-^
                ab -> ac^_
            ",
            r#"
                disc(
                    pre_break=[
                        chars("ax")
                        chars("-")
                    ],
                    post_break=[
                        chars("b")
                    ],
                    replace_count=2,
                )
                chars("a")
                lig("c", "b")
            "#
        },
        // A rule involving the left boundary character.
        {
            left_boundary_char, "a-b",
            "
                |b -> |c^_
            ",
            r#"
                chars("a")
                disc(
                    pre_break=[
                        chars("-")
                    ],
                    post_break=[
                        chars("c")
                    ],
                    replace_count=1,
                )
                chars("b")
            "#
        },
        // A rule involving the right boundary character.
        {
            right_boundary_char, "a-b",
            "
                -| -> -c^|
            ",
            r#"
                chars("a")
                disc(
                    pre_break=[
                        chars("-c")
                    ],
                )
                chars("b")
            "#
        },
        // The whole word collapses into a single ligature; the break is after "a".
        {
            big_lig_1, "a-bc",
            "
                ab -> _x^_
                xc -> _y^_
            ",
            r#"
                disc(
                    pre_break=[
                        chars("a")
                        chars("-")
                    ],
                    post_break=[
                        chars("b")
                        chars("c")
                    ],
                    replace_count=1,
                )
                lig("y", "abc")
            "#
        },
        // As big_lig_1, but the letters after the break also form a ligature.
        {
            big_lig_2, "a-bc",
            "
                ab -> _x^_
                xc -> _y^_
                bc -> _z^_
            ",
            r#"
                disc(
                    pre_break=[
                        chars("a")
                        chars("-")
                    ],
                    post_break=[
                        chars("z")
                    ],
                    replace_count=1,
                )
                lig("y", "abc")
            "#
        },
        // As big_lig_1, but the break is after "ab".
        {
            big_lig_3, "ab-c",
            "
                ab -> _x^_
                xc -> _y^_
            ",
            r#"
                disc(
                    pre_break=[
                        chars("x")
                        chars("-")
                    ],
                    post_break=[
                        chars("c")
                    ],
                    replace_count=1,
                )
                lig("y", "abc")
            "#
        },
        // A ligature result that itself forms a ligature with the hyphen character.
        {
            big_lig_with_hyphen, "ab-c",
            "
                ab -> ax^_
                x- -> xy^-
            ",
            r#"
                disc(
                    pre_break=[
                        chars("axy")
                        chars("-")
                    ],
                    replace_count=2,
                )
                chars("a")
                lig("x", "b")
                chars("c")
            "#
        },
        // A ligature node with no original characters sits before the break.
        {
            empty_lig_before, "a-b",
            "
                ab -> ax^b
            ",
            r#"
                disc(
                    pre_break=[
                        chars("a")
                        chars("-")
                    ],
                    replace_count=2,
                )
                chars("a")
                lig("x", "")
                chars("b")
            "#
        },
        // The same kern applies between the letters and between letter and hyphen.
        {
            same_kern, "a-b",
            "
                ab -> a[100]b
                a- -> a[100]-
            ",
            r#"
                disc(
                    pre_break=[
                        chars("a")
                        kern(0.00095pt)
                        chars("-")
                    ],
                    replace_count=2,
                )
                chars("a")
                kern(0.00095pt)
                chars("b")
            "#
        },
        // Overlapping ligature rules: the broken and unbroken texts only line up
        // again at the synchronization point.
        {
            synchronization_1, "a-bcdefgh",
            "
                ab -> _x^_
                bc -> _y^_
                cd -> _z^_
                de -> _w^_
                ef -> _v^_
            ",
            r#"
                disc(
                    pre_break=[
                        chars("a-")
                    ],
                    post_break=[
                        chars("ywf")
                    ],
                    replace_count=3,
                )
                lig("x", "ab")
                lig("z", "cd")
                lig("v", "ef")
                # synchronization point
                chars("gh", font=0)
            "#
        },
        // As synchronization_1, with further break points in the word.
        {
            synchronization_2, "a-bcd-ef-gh",
            "
                ab -> _x^_
                bc -> _y^_
                cd -> _z^_
                de -> _w^_
                ef -> _v^_
            ",
            r#"
                disc(
                    pre_break=[
                        chars("a-")
                    ],
                    post_break=[
                        chars("ywf")
                    ],
                    replace_count=3,
                )
                lig("x", "ab")
                lig("z", "cd")
                # the hyphen here is skipped
                lig("v", "ef")
                # synchronization point
                disc(
                    pre_break=[
                        chars("-")
                    ],
                    post_break=[
                    ],
                )
                chars("gh", font=0)
            "#
        },
        {
            synchronization_4, "a-bcde",
            "
                ab -> _x^_
                bc -> _y^_
                xc -> _y^_
                yd -> yzd^
            ",
            r#"
                disc(
                    pre_break=[
                        chars("a-")
                    ],
                    post_break=[
                        # these are duplicated from the main list.
                        # we could have nothing here and replace_count=0
                        chars("yz")
                    ],
                    replace_count=2,
                )
                lig("y", "abc")
                lig("z", "")
                chars("de", font=0)
            "#
        },
    ];
}
682}