1#[derive(Default)]
11pub struct Hyphenator {
12 data: Vec<u8>,
13 patterns: trie::Trie,
14}
15
16pub trait LowerCaser {
18 fn to_lower_case(&self, c: char) -> Option<char>;
21}
22
23#[derive(Default)]
26pub struct AsciiLowerCaser {}
27
28impl LowerCaser for AsciiLowerCaser {
29 fn to_lower_case(&self, c: char) -> Option<char> {
30 if c.is_ascii_alphabetic() {
31 Some(c.to_ascii_lowercase())
32 } else {
33 None
34 }
35 }
36}
37
38impl Hyphenator {
39 pub fn plain_tex_en_us() -> Self {
41 let patterns = include_str!("plain_tex_patterns.txt");
42 let exceptions = include_str!("plain_tex_exceptions.txt");
43 let mut h: Self = Default::default();
44 h.load_patterns(patterns);
45 h.insert_exceptions(exceptions);
46 h
47 }
48 pub fn load_patterns(&mut self, patterns: &str) {
50 let mut empty_value: Option<trie::Value> = None;
52 for pattern in patterns.split_whitespace() {
53 let mut vertex = self.patterns.root();
54 let mut value = &mut empty_value;
55 if pattern.starts_with('.') {
56 (vertex, value) = self.patterns.next(vertex, trie::Edge::StartOfWord);
57 }
58 let data_start = self.data.len();
59 enum State {
60 AfterChar(usize),
65 AfterScore,
66 }
67 let mut state = State::AfterChar(0);
68 for c in pattern.chars() {
69 match c {
70 '0'..='9' => {
71 match state {
72 State::AfterChar(mut n) => {
73 while n > 0 {
74 let code = (n + 10).try_into().unwrap_or(u8::MAX);
75 self.data.push(code);
76 n -= (code - 10) as usize;
77 }
78 }
79 State::AfterScore => {
80 self.data.pop();
83 }
84 }
85 self.data
86 .push((c as u32 - '0' as u32).try_into().expect("digits are <= 9"));
87 state = State::AfterScore;
88 }
89 '.' => {
90 continue;
92 }
93 _ => {
94 (vertex, value) = self.patterns.next(vertex, trie::Edge::Char(c));
97 state = State::AfterChar(match state {
98 State::AfterChar(n) => n + 1,
99 State::AfterScore => 0,
100 });
101 }
102 }
103 }
104 if pattern.ends_with('.') {
105 (vertex, value) = self.patterns.next(vertex, trie::Edge::EndOfWord);
106 }
107 self.data.push(10);
108 *value = Some(trie::Value(data_start));
109 }
110 }
111 pub fn insert_exceptions(&mut self, hyphenated_words: &str) {
114 hyphenated_words
115 .lines()
116 .map(|l| l.trim())
117 .filter(|l| !l.is_empty())
118 .for_each(|l| {
119 self.insert_exception(l);
120 });
121 }
122 pub fn insert_exception(&mut self, hyphenated_word: &str) {
125 let mut vertex = self.patterns.root();
126 vertex = self.patterns.next(vertex, trie::Edge::StartOfWord).0;
127 let data_start = self.data.len();
128 let mut word = String::new();
129 let mut indices = vec![0];
130 self.data.push(6);
131 for c in hyphenated_word.chars() {
132 if c == '-' {
133 indices.pop();
134 indices.push(7);
135 self.data.pop();
136 self.data.push(7);
137 } else {
138 vertex = self.patterns.next(vertex, trie::Edge::Char(c)).0;
139 word.push(c);
140 indices.push(6);
141 self.data.push(6);
142 }
143 }
144 self.data.push(10);
145 let value = self.patterns.next(vertex, trie::Edge::EndOfWord).1;
146 *value = Some(trie::Value(data_start));
147 }
148 pub fn hypthenate<L: LowerCaser>(&self, lower_caser: &L, word: &str, target: &mut String) {
150 let mut indices = self.calculate_indices(lower_caser, word);
151 let mut next = indices.next();
152 for (i, c) in word.chars().enumerate() {
153 if next == Some(i) {
154 target.push('-');
155 next = indices.next();
156 }
157 target.push(c);
158 }
159 }
160 pub fn calculate_indices<L: LowerCaser>(
162 &self,
163 lower_caser: &L,
164 word: &str,
165 ) -> impl Iterator<Item = usize> {
166 let mut total_scores = vec![0_u8; word.len() + 1];
167
168 let mut process = |mut vertex: trie::Vertex, lower: usize| {
169 let mut chars = word[lower..].chars().map(|c| lower_caser.to_lower_case(c));
170 loop {
171 let edge = match chars.next() {
172 None => trie::Edge::EndOfWord,
173 Some(None) => {
174 return;
177 }
178 Some(Some(c)) => trie::Edge::Char(c),
179 };
180 let pattern;
181 (vertex, pattern) = match self.patterns.next_or(vertex, edge) {
182 None => return,
183 Some(entry) => entry,
184 };
185 let Some(pattern) = pattern else {
186 continue;
187 };
188 let mut k = 0;
189 for op in self.data[pattern.0..].iter() {
190 match op {
191 score @ ..10 => {
192 if total_scores[lower + k] < *score {
193 total_scores[lower + k] = *score;
194 }
195 k += 1;
196 }
197 10 => {
198 break;
199 }
200 _ => {
201 k += (op - 10) as usize;
202 }
203 }
204 }
205 }
206 };
207 if let Some((vertex, _)) = self
208 .patterns
209 .next_or(self.patterns.root(), trie::Edge::StartOfWord)
210 {
211 process(vertex, 0);
212 }
213 let mut num_chars: usize = 0;
214 let mut lower = 0;
215 while let Some(c) = word[lower..].chars().next() {
216 if lower_caser.to_lower_case(c).is_none() {
217 break;
218 }
219 process(self.patterns.root(), lower);
220 lower += c.len_utf8();
221 num_chars += 1;
222 }
223 total_scores[0] = 0;
224 total_scores.truncate(num_chars);
225 total_scores
226 .into_iter()
227 .enumerate()
228 .filter(|(_, score)| *score % 2 != 0)
229 .map(|(i, _)| i)
230 }
231}
232
233pub fn strip_hyphens(word: &str) -> String {
235 word.chars().filter(|c| *c != '-').collect()
236}
237
238mod trie {
239 use std::collections::HashMap;
240
241 #[derive(Debug, Default)]
242 pub struct Trie {
243 m: HashMap<(Vertex, Edge), (Vertex, Option<Value>)>,
244 next_vertex: Vertex,
245 }
246
247 impl Trie {
248 pub fn root(&self) -> Vertex {
249 Vertex(u32::MAX)
250 }
251 pub fn next_or(&self, current: Vertex, edge: Edge) -> Option<(Vertex, Option<Value>)> {
252 self.m.get(&(current, edge)).copied()
253 }
254 pub fn next(&mut self, current: Vertex, edge: Edge) -> (Vertex, &mut Option<Value>) {
255 let (a, b) = self.m.entry((current, edge)).or_insert_with(|| {
256 let next = self.next_vertex;
257 self.next_vertex = Vertex(self.next_vertex.0 + 1);
258 (next, None)
259 });
260 (*a, b)
261 }
262 }
263
264 #[derive(Debug, PartialEq, Eq, Hash, Default, Clone, Copy)]
265 pub struct Vertex(u32);
266
267 #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
268 pub enum Edge {
269 StartOfWord,
270 Char(char),
271 EndOfWord,
272 }
273
274 #[derive(Debug, Clone, Copy)]
275 pub struct Value(pub usize);
276}
277
278#[cfg(test)]
279mod test {
280 use super::*;
281
282 macro_rules! hyphenation_tests {
283 ( $( $name:ident => $expected:literal, )* ) => {
284 $(
285 #[test]
286 #[allow(non_snake_case)]
287 fn $name() {
288 let hyphenator = Hyphenator::plain_tex_en_us();
289 let mut got = String::new();
290 let lower_caser: AsciiLowerCaser = Default::default();
291 hyphenator.hypthenate( &lower_caser, stringify!($name), &mut got);
292 assert_eq!(got, $expected);
293 }
294 )*
295 };
296 }
297
298 hyphenation_tests!(
299 record => "record",
302 hyphenation => "hy-phen-ation",
303 concatenation => "con-cate-na-tion",
304 supercalifragilisticexpialidocious => "su-per-cal-ifrag-ilis-tic-ex-pi-ali-do-cious",
305 bachelor => "bach-e-lor",
306 echelon => "ech-e-lon",
307 toothaches => "toothaches",
308 campfire => "camp-fire",
309 biorhythm => "biorhyth-m",
310 algorithm => "al-go-rith-m",
311 pneumonoultramicroscopicsilicovolcanoconiosis => "p-neu-monoul-tra-mi-cro-scop-ic-sil-i-co-vol-canoco-nio-sis",
312 project => "project",
313 present => "present",
314 table => "ta-ble",
315 Table => "Ta-ble",
316 ach => "ach",
318 Aaronic => "Aa-ron-ic",
319 Abelia => "A-beli-a",
320 William => "William",
321 chaffless => "chaf-f-less",
322 );
323}