hyphenate/
lib.rs

1//! Implementation of TeX's hyphenation algorithm.
2//!
3//! The main entry point is [`Hyphenator`], which can be constructed with
4//! plain TeX's built-in English patterns using [`Hyphenator::plain_tex_en_us`].
5
6/// Hyphenates words using TeX's pattern-matching algorithm (Knuth-Liang).
7///
8/// Construct with [`Hyphenator::plain_tex_en_us`] for the standard plain TeX
9/// English patterns, or load custom patterns with [`Hyphenator::load_patterns`].
10#[derive(Default)]
11pub struct Hyphenator {
12    data: Vec<u8>,
13    patterns: trie::Trie,
14}
15
16/// Implementations of this trait can get the lower case character of a character.
17pub trait LowerCaser {
18    /// Return the lower case character of the provided character, of [`None`] if the character
19    /// does not have a lower case character.
20    fn to_lower_case(&self, c: char) -> Option<char>;
21}
22
23/// The ASCII implementation of [`LowerCaser`] returns the lower case letter for all ASCII
24/// alphabetic characters, and [`None`] for all other characters.
25#[derive(Default)]
26pub struct AsciiLowerCaser {}
27
28impl LowerCaser for AsciiLowerCaser {
29    fn to_lower_case(&self, c: char) -> Option<char> {
30        if c.is_ascii_alphabetic() {
31            Some(c.to_ascii_lowercase())
32        } else {
33            None
34        }
35    }
36}
37
38impl Hyphenator {
39    /// Construct a hyphenator loaded with plain TeX's English (US) patterns and exceptions.
40    pub fn plain_tex_en_us() -> Self {
41        let patterns = include_str!("plain_tex_patterns.txt");
42        let exceptions = include_str!("plain_tex_exceptions.txt");
43        let mut h: Self = Default::default();
44        h.load_patterns(patterns);
45        h.insert_exceptions(exceptions);
46        h
47    }
48    /// Load hyphenation patterns from a whitespace-separated string in TeX pattern format.
49    pub fn load_patterns(&mut self, patterns: &str) {
50        // TeX.2021.961 and onwards.
51        let mut empty_value: Option<trie::Value> = None;
52        for pattern in patterns.split_whitespace() {
53            let mut vertex = self.patterns.root();
54            let mut value = &mut empty_value;
55            if pattern.starts_with('.') {
56                (vertex, value) = self.patterns.next(vertex, trie::Edge::StartOfWord);
57            }
58            let data_start = self.data.len();
59            enum State {
60                // The payload is the number of characters before this character
61                // that were not assigned a score. If we write out a score for this
62                // character, we will first write out zero scores for the other
63                // characters. This is a serialization optimization.
64                AfterChar(usize),
65                AfterScore,
66            }
67            let mut state = State::AfterChar(0);
68            for c in pattern.chars() {
69                match c {
70                    '0'..='9' => {
71                        match state {
72                            State::AfterChar(mut n) => {
73                                while n > 0 {
74                                    let code = (n + 10).try_into().unwrap_or(u8::MAX);
75                                    self.data.push(code);
76                                    n -= (code - 10) as usize;
77                                }
78                            }
79                            State::AfterScore => {
80                                // In the case of two consecutive scores (e.g. a123b)
81                                // it seems the last one wins. See. TeX.2021.962.
82                                self.data.pop();
83                            }
84                        }
85                        self.data
86                            .push((c as u32 - '0' as u32).try_into().expect("digits are <= 9"));
87                        state = State::AfterScore;
88                    }
89                    '.' => {
90                        // Already handled above.
91                        continue;
92                    }
93                    _ => {
94                        // TODO: we should error if it's not a valid pattern like
95                        // // the `help1` in TeX.2021.962.
96                        (vertex, value) = self.patterns.next(vertex, trie::Edge::Char(c));
97                        state = State::AfterChar(match state {
98                            State::AfterChar(n) => n + 1,
99                            State::AfterScore => 0,
100                        });
101                    }
102                }
103            }
104            if pattern.ends_with('.') {
105                (vertex, value) = self.patterns.next(vertex, trie::Edge::EndOfWord);
106            }
107            self.data.push(10);
108            *value = Some(trie::Value(data_start));
109        }
110    }
111    /// Add multiple hyphenation exceptions. These are separate words separated by whitespace, with
112    /// each word satisfying the format in [`Self::insert_exception`].
113    pub fn insert_exceptions(&mut self, hyphenated_words: &str) {
114        hyphenated_words
115            .lines()
116            .map(|l| l.trim())
117            .filter(|l| !l.is_empty())
118            .for_each(|l| {
119                self.insert_exception(l);
120            });
121    }
122    /// Add a hyphenation exception. The word is given with hyphens marking the allowed break points,
123    /// e.g. `"hy-phen-ation"`.
124    pub fn insert_exception(&mut self, hyphenated_word: &str) {
125        let mut vertex = self.patterns.root();
126        vertex = self.patterns.next(vertex, trie::Edge::StartOfWord).0;
127        let data_start = self.data.len();
128        let mut word = String::new();
129        let mut indices = vec![0];
130        self.data.push(6);
131        for c in hyphenated_word.chars() {
132            if c == '-' {
133                indices.pop();
134                indices.push(7);
135                self.data.pop();
136                self.data.push(7);
137            } else {
138                vertex = self.patterns.next(vertex, trie::Edge::Char(c)).0;
139                word.push(c);
140                indices.push(6);
141                self.data.push(6);
142            }
143        }
144        self.data.push(10);
145        let value = self.patterns.next(vertex, trie::Edge::EndOfWord).1;
146        *value = Some(trie::Value(data_start));
147    }
148    /// Hyphenate a word, returning it with `-` inserted at each valid break point.
149    pub fn hypthenate<L: LowerCaser>(&self, lower_caser: &L, word: &str, target: &mut String) {
150        let mut indices = self.calculate_indices(lower_caser, word);
151        let mut next = indices.next();
152        for (i, c) in word.chars().enumerate() {
153            if next == Some(i) {
154                target.push('-');
155                next = indices.next();
156            }
157            target.push(c);
158        }
159    }
160    /// Return the set of character indices before which a hyphen may be inserted.
161    pub fn calculate_indices<L: LowerCaser>(
162        &self,
163        lower_caser: &L,
164        word: &str,
165    ) -> impl Iterator<Item = usize> {
166        let mut total_scores = vec![0_u8; word.len() + 1];
167
168        let mut process = |mut vertex: trie::Vertex, lower: usize| {
169            let mut chars = word[lower..].chars().map(|c| lower_caser.to_lower_case(c));
170            loop {
171                let edge = match chars.next() {
172                    None => trie::Edge::EndOfWord,
173                    Some(None) => {
174                        // Some(None) occurs when the words contains a non-letter character.
175                        // In this case we stop trying to hyphenate.
176                        return;
177                    }
178                    Some(Some(c)) => trie::Edge::Char(c),
179                };
180                let pattern;
181                (vertex, pattern) = match self.patterns.next_or(vertex, edge) {
182                    None => return,
183                    Some(entry) => entry,
184                };
185                let Some(pattern) = pattern else {
186                    continue;
187                };
188                let mut k = 0;
189                for op in self.data[pattern.0..].iter() {
190                    match op {
191                        score @ ..10 => {
192                            if total_scores[lower + k] < *score {
193                                total_scores[lower + k] = *score;
194                            }
195                            k += 1;
196                        }
197                        10 => {
198                            break;
199                        }
200                        _ => {
201                            k += (op - 10) as usize;
202                        }
203                    }
204                }
205            }
206        };
207        if let Some((vertex, _)) = self
208            .patterns
209            .next_or(self.patterns.root(), trie::Edge::StartOfWord)
210        {
211            process(vertex, 0);
212        }
213        let mut num_chars: usize = 0;
214        let mut lower = 0;
215        while let Some(c) = word[lower..].chars().next() {
216            if lower_caser.to_lower_case(c).is_none() {
217                break;
218            }
219            process(self.patterns.root(), lower);
220            lower += c.len_utf8();
221            num_chars += 1;
222        }
223        total_scores[0] = 0;
224        total_scores.truncate(num_chars);
225        total_scores
226            .into_iter()
227            .enumerate()
228            .filter(|(_, score)| *score % 2 != 0)
229            .map(|(i, _)| i)
230    }
231}
232
233/// Remove all `-` characters from a word.
234pub fn strip_hyphens(word: &str) -> String {
235    word.chars().filter(|c| *c != '-').collect()
236}
237
238mod trie {
239    use std::collections::HashMap;
240
241    #[derive(Debug, Default)]
242    pub struct Trie {
243        m: HashMap<(Vertex, Edge), (Vertex, Option<Value>)>,
244        next_vertex: Vertex,
245    }
246
247    impl Trie {
248        pub fn root(&self) -> Vertex {
249            Vertex(u32::MAX)
250        }
251        pub fn next_or(&self, current: Vertex, edge: Edge) -> Option<(Vertex, Option<Value>)> {
252            self.m.get(&(current, edge)).copied()
253        }
254        pub fn next(&mut self, current: Vertex, edge: Edge) -> (Vertex, &mut Option<Value>) {
255            let (a, b) = self.m.entry((current, edge)).or_insert_with(|| {
256                let next = self.next_vertex;
257                self.next_vertex = Vertex(self.next_vertex.0 + 1);
258                (next, None)
259            });
260            (*a, b)
261        }
262    }
263
264    #[derive(Debug, PartialEq, Eq, Hash, Default, Clone, Copy)]
265    pub struct Vertex(u32);
266
267    #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
268    pub enum Edge {
269        StartOfWord,
270        Char(char),
271        EndOfWord,
272    }
273
274    #[derive(Debug, Clone, Copy)]
275    pub struct Value(pub usize);
276}
277
278#[cfg(test)]
279mod test {
280    use super::*;
281
282    macro_rules! hyphenation_tests {
283        ( $( $name:ident => $expected:literal, )* ) => {
284            $(
285                #[test]
286                #[allow(non_snake_case)]
287                fn $name() {
288                    let hyphenator = Hyphenator::plain_tex_en_us();
289                    let mut got = String::new();
290                    let lower_caser: AsciiLowerCaser = Default::default();
291                    hyphenator.hypthenate( &lower_caser, stringify!($name), &mut got);
292                    assert_eq!(got, $expected);
293                }
294            )*
295        };
296    }
297
298    hyphenation_tests!(
299        // From the TeXBook.
300        // But the TeXBook assumes that \leftminhyphen=\rightminhyphen=3 so the results are a little different.
301        record => "record",
302        hyphenation => "hy-phen-ation",
303        concatenation => "con-cate-na-tion",
304        supercalifragilisticexpialidocious => "su-per-cal-ifrag-ilis-tic-ex-pi-ali-do-cious",
305        bachelor => "bach-e-lor",
306        echelon => "ech-e-lon",
307        toothaches => "toothaches",
308        campfire => "camp-fire",
309        biorhythm => "biorhyth-m",
310        algorithm => "al-go-rith-m",
311        pneumonoultramicroscopicsilicovolcanoconiosis => "p-neu-monoul-tra-mi-cro-scop-ic-sil-i-co-vol-canoco-nio-sis",
312        project => "project",
313        present => "present",
314        table => "ta-ble",
315        Table => "Ta-ble",
316        // From running the hyphenator over /usr/share/dict/words on Mac
317        ach => "ach",
318        Aaronic => "Aa-ron-ic",
319        Abelia => "A-beli-a",
320        William => "William",
321        chaffless => "chaf-f-less",
322    );
323}