tfm/ligkern/
lang.rs

1//! Types corresponding to the "lig/kern programming language".
2//!
3//! See the documentation on the [`super`] module for information about this programming language.
4//!
5//! The types here are put in a separate module because users of this crate are generally not expected to use them.
6//! Instead, users will work with compiled lig/kern programs.
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::collections::HashSet;
11
12use crate::Char;
13use crate::FixWord;
14
15/// A lig/kern program.
16///
17/// In theory the program also includes entrypoints.
18/// However because these are provided in different ways in .tfm and .pl files,
19/// it's easier to exclude them on this type and pass them in when needed.
20#[derive(Clone, Debug, Default, PartialEq, Eq)]
21pub struct Program {
22    pub instructions: Vec<Instruction>,
23    pub left_boundary_char_entrypoint: Option<u16>,
24    pub right_boundary_char: Option<Char>,
25    pub passthrough: HashSet<u16>,
26}
27
28/// A single instruction in a lig/kern program.
29///
30/// In TFM files, instructions are serialized to 32 bit integers.
31///
32/// In property list files, instructions are specified using a `(LIG _ _)` or `(KERN _ _)` element,
33///     and optionally a `(STOP)` or `(SKIP _)` element directly after.
34#[derive(Clone, Debug, PartialEq, Eq)]
35pub struct Instruction {
36    /// Specifies the next instruction to run if this instruction is not applicable -
37    ///     e.g., if the right character of the pair is not `right_char`.
38    /// If the payload is present, that number of lig/kern instructions in the list of all instructions are skipped to
39    ///     find the next instruction.
40    /// Otherwise this is the final instruction and there are no more instructions to consider.
41    pub next_instruction: Option<u8>,
42    /// This instruction is run if the right character in the pair is this character.
43    /// Otherwise the next lig kern instruction for the current character is considered,
44    ///     using the `next_instruction` field.
45    ///
46    /// After this operation is performed,
47    ///     no more operations need to be performed on this pair.
48    /// However the result of the operation may result in a new pair being created
49    ///     and the lig/kern program will run for that pair.
50    /// See the documentation on [`PostLigOperation`] for information on that.
51    pub right_char: Char,
52    /// The operation to perform for this instruction.
53    pub operation: Operation,
54}
55
56/// A lig/kern operation to perform.
57#[derive(Debug, PartialEq, Eq, Copy, Clone)]
58pub enum Operation {
59    /// Insert a kern between the current character and the next character.
60    ///
61    /// The variant payload is the size of the kern.
62    Kern(FixWord),
63    /// Insert a kern between the current character and the next character.
64    ///
65    /// The variant payload is the index of the kern in the kerns array.
66    KernAtIndex(u16),
67    /// Perform a ligature step.
68    /// This inserts `char_to_insert` between the left and right characters,
69    ///     and then performs the post-lig operation.
70    Ligature {
71        /// Character to insert.
72        char_to_insert: Char,
73        /// What to do after inserting the character.
74        post_lig_operation: PostLigOperation,
75        /// If the tag in the .tfm file was invalid, this will be true.
76        /// In this case the post_lig_operation will be [`PostLigOperation::RetainNeitherMoveToInserted`].
77        ///
78        /// This field is used in the .tfm validation function to generate a warning for this case.
79        /// We could also generate a warning in the deserialization code itself but then the
80        /// ordering of the warning would be incorrect with respect to other validation warnings.
81        post_lig_tag_invalid: bool,
82    },
83    /// If the entrypoint for a character is this operation, go to the instruction indexed by the payload.
84    ///
85    /// This redirect mechanism exists because in .tfm files entrypoints are [`u8`]s but lig/kern
86    ///     programs can contain more than 256 instructions.
87    ///
88    /// If this operation is encountered in another situation, it is an unconditional stop.
89    ///
90    /// The boolean value in the payload is true if the boundary character should be
91    ///     serialized inside the instruction.
92    EntrypointRedirect(u16, bool),
93}
94
95/// Errors returned by the [`Operation::parse_compact`] method.
96#[derive(Debug, PartialEq)]
97pub enum ParseCompactOperationError {
98    NotThreeWords,
99    MiddleWordIsNotAnArrow,
100    FirstWordIsWrongSize,
101    LeftCharInvalid,
102    RightCharInvalid,
103    ReplacementCharInvalid,
104    InvalidDigit,
105    InvalidKern,
106    InvalidLigature,
107    MissingCursor,
108    CursorNotAfterChar,
109    MultipleCursors,
110    MissingReplacementChar,
111    MissingLeftChar,
112    MissingRightChar,
113}
114
115impl Operation {
116    /// Parses a compact representation of an operation.
117    ///
118    /// The representation is of the form `<left><right> -> <operation>`
119    /// where `<left>` and `<right>` are the pair of characters this operation applies to,
120    /// and `<operation>` describes the [`Operation`].
121    /// The value of `<left>` being `|` means that the rule is for the left boundary of a word.
122    ///
123    /// ## Kerns
124    ///
125    /// For kerns, the operation is `<left>[<n>]<right>` where `<n>` is a decimal integer
126    /// (possibly negative) giving the kern size as a [`FixWord`]:
127    ///
128    /// ```
129    /// use tfm::ligkern::lang::*;
130    /// use tfm::FixWord;
131    /// assert_eq![
132    ///     Operation::parse_compact("ab -> a[13]b"),
133    ///     Ok((Some('a'), 'b', Operation::Kern(FixWord(13))))
134    /// ];
135    /// assert_eq![
136    ///     Operation::parse_compact("ab -> a[-4]b"),
137    ///     Ok((Some('a'), 'b', Operation::Kern(FixWord(-4))))
138    /// ];
139    /// ```
140    ///
141    /// ## Ligatures
142    ///
143    /// For ligatures, the operation consists of four characters.
144    /// The following three characters come in order:
145    /// 1.  the left character if it is retained, or `_` if it is not.
146    /// 1.  the replacement character.
147    /// 1.  the right character if it is retained, or `_` if it is not.
148    ///
149    /// There is also a caret `^` indicating where the cursor should be after
150    /// the replacement. The caret can come after any character that is not `_`.
151    /// Thus:
152    ///
153    /// ```
154    /// use tfm::ligkern::lang::*;
155    /// assert_eq![
156    ///     Operation::parse_compact("ab -> ax^b"),
157    ///     Ok((Some('a'), 'b', Operation::Ligature{
158    ///         char_to_insert: 'x'.try_into().unwrap(),
159    ///         post_lig_operation: PostLigOperation::RetainBothMoveToInserted,
160    ///         post_lig_tag_invalid: false,
161    ///     }))
162    /// ];
163    /// assert_eq![
164    ///     Operation::parse_compact("ab -> _xb^"),
165    ///     Ok((Some('a'), 'b', Operation::Ligature{
166    ///         char_to_insert: 'x'.try_into().unwrap(),
167    ///         post_lig_operation: PostLigOperation::RetainRightMoveToRight,
168    ///         post_lig_tag_invalid: false,
169    ///     }))
170    /// ];
171    /// ```
172    pub fn parse_compact(
173        s: &str,
174    ) -> Result<(Option<char>, char, Self), ParseCompactOperationError> {
175        use ParseCompactOperationError::*;
176        let words: Option<[&str; 3]> = (|| {
177            let mut raw = s.split_ascii_whitespace();
178            let words = [raw.next()?, raw.next()?, raw.next()?];
179            if raw.next().is_some() {
180                None
181            } else {
182                Some(words)
183            }
184        })();
185        let Some([first, second, third]) = words else {
186            return Err(NotThreeWords);
187        };
188        if second != "->" {
189            return Err(MiddleWordIsNotAnArrow);
190        }
191        let mut c = first.chars();
192        let (l, r) = match (c.next(), c.next(), c.next()) {
193            (Some(l), Some(r), None) => (l, r),
194            _ => {
195                return Err(FirstWordIsWrongSize);
196            }
197        };
198        if l == '_' || l == '^' {
199            return Err(LeftCharInvalid);
200        }
201        if r == '_' || r == '^' {
202            return Err(RightCharInvalid);
203        }
204
205        if third.contains('[') {
206            let mut chars = third.chars();
207            let lc = chars.next().ok_or(MissingLeftChar)?;
208            if chars.next() != Some('[') {
209                return Err(InvalidKern);
210            }
211            let n_start = lc.len_utf8() + '['.len_utf8();
212            let mut n_end = n_start;
213            loop {
214                match chars.next() {
215                    None => return Err(InvalidKern),
216                    Some(']') => break,
217                    Some(c) => {
218                        n_end += c.len_utf8();
219                    }
220                }
221            }
222            let rc = chars.next().ok_or(MissingRightChar)?;
223            if chars.next().is_some() {
224                return Err(InvalidKern);
225            }
226            if lc != l {
227                return Err(MissingLeftChar);
228            }
229            if rc != r {
230                return Err(MissingRightChar);
231            }
232            let n: i32 = third[n_start..n_end].parse().map_err(|_| InvalidKern)?;
233            return Ok((
234                if l == '|' { None } else { Some(l) },
235                r,
236                Operation::Kern(FixWord(n)),
237            ));
238        }
239
240        let mut c = third.chars();
241        let c = match (c.next(), c.next(), c.next(), c.next(), c.next()) {
242            (Some(a), Some(b), Some(c), Some(d), None) => (a, b, c, d),
243            _ => {
244                return Err(InvalidLigature);
245            }
246        };
247        enum CaretPos {
248            Left,
249            Middle,
250            Right,
251        }
252        let (caret_pos, a, b, c) = match c {
253            ('^', _, _, _) => {
254                return Err(CursorNotAfterChar);
255            }
256            (a, '^', b, c) => (CaretPos::Left, a, b, c),
257            (a, b, '^', c) => (CaretPos::Middle, a, b, c),
258            (a, b, c, '^') => (CaretPos::Right, a, b, c),
259            _ => {
260                return Err(MissingCursor);
261            }
262        };
263        let retain_l = if a == l {
264            true
265        } else if a == '_' {
266            false
267        } else {
268            return Err(MissingLeftChar);
269        };
270        if b == '_' {
271            return Err(ReplacementCharInvalid);
272        }
273        let Ok(b) = b.try_into() else {
274            return Err(ReplacementCharInvalid);
275        };
276        let retain_r = if c == r {
277            true
278        } else if c == '_' {
279            false
280        } else {
281            return Err(MissingRightChar);
282        };
283        let post_lig_operation = match (retain_l, retain_r, caret_pos) {
284            (true, true, CaretPos::Left) => PostLigOperation::RetainBothMoveNowhere,
285            (true, true, CaretPos::Middle) => PostLigOperation::RetainBothMoveToInserted,
286            (true, true, CaretPos::Right) => PostLigOperation::RetainBothMoveToRight,
287            (true, false, CaretPos::Left) => PostLigOperation::RetainLeftMoveNowhere,
288            (true, false, CaretPos::Middle) => PostLigOperation::RetainLeftMoveToInserted,
289            (false, true, CaretPos::Middle) => PostLigOperation::RetainRightMoveToInserted,
290            (false, true, CaretPos::Right) => PostLigOperation::RetainRightMoveToRight,
291            (false, false, CaretPos::Middle) => PostLigOperation::RetainNeitherMoveToInserted,
292            (true, false, CaretPos::Right)
293            | (false, true, CaretPos::Left)
294            | (false, false, CaretPos::Left)
295            | (false, false, CaretPos::Right) => return Err(CursorNotAfterChar),
296        };
297        Ok((
298            if l == '|' { None } else { Some(l) },
299            r,
300            Operation::Ligature {
301                char_to_insert: b,
302                post_lig_operation,
303                post_lig_tag_invalid: false,
304            },
305        ))
306    }
307
308    /// Display the compact representation of a operation. The format is described in [`Operation::parse_compact`].
309    pub fn display_compact<'a>(&'a self, l: char, r: char) -> impl std::fmt::Display + 'a {
310        struct D<'a> {
311            s: &'a Operation,
312            l: char,
313            r: char,
314        }
315        impl<'a> std::fmt::Display for D<'a> {
316            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
317                write!(f, "{}{} -> ", self.l, self.r)?;
318                match self.s {
319                    Operation::Kern(kern) => {
320                        write!(f, "{}[{}]{}", self.l, kern.0, self.r)
321                    }
322                    Operation::KernAtIndex(_) | Operation::EntrypointRedirect(_, _) => {
323                        write!(f, "<unknown>")
324                    }
325                    Operation::Ligature {
326                        char_to_insert,
327                        post_lig_operation,
328                        post_lig_tag_invalid: _,
329                    } => {
330                        use PostLigOperation::*;
331                        match post_lig_operation {
332                            RetainBothMoveNowhere => {
333                                write!(f, "{}^{}{}", self.l, char_to_insert, self.r)
334                            }
335                            RetainBothMoveToInserted => {
336                                write!(f, "{}{}^{}", self.l, char_to_insert, self.r)
337                            }
338                            RetainBothMoveToRight => {
339                                write!(f, "{}{}{}^", self.l, char_to_insert, self.r)
340                            }
341                            RetainRightMoveToInserted => {
342                                write!(f, "_{}^{}", char_to_insert, self.r)
343                            }
344                            RetainRightMoveToRight => {
345                                write!(f, "_{}{}^", char_to_insert, self.r)
346                            }
347                            RetainLeftMoveNowhere => {
348                                write!(f, "{}^{}_", self.l, char_to_insert)
349                            }
350                            RetainLeftMoveToInserted => {
351                                write!(f, "{}{}^_", self.l, char_to_insert)
352                            }
353                            RetainNeitherMoveToInserted => {
354                                write!(f, "_{}^_", char_to_insert)
355                            }
356                        }
357                    }
358                }
359            }
360        }
361        D { s: self, l, r }
362    }
363
364    pub(crate) fn lig_kern_operation_from_bytes(op_byte: u8, remainder: u8) -> Self {
365        match op_byte.checked_sub(128) {
366            Some(r) => Self::KernAtIndex(u16::from_be_bytes([r, remainder])),
367            None => {
368                // TFtoPL.2014.77
369                let delete_next_char = op_byte.is_multiple_of(2);
370                let op_byte = op_byte / 2;
371                let delete_current_char = op_byte.is_multiple_of(2);
372                let skip = op_byte / 2;
373                use PostLigOperation::*;
374                let (post_lig_operation, post_lig_tag_invalid) =
375                    match (delete_current_char, delete_next_char, skip) {
376                        (false, false, 0) => (RetainBothMoveNowhere, false),
377                        (false, false, 1) => (RetainBothMoveToInserted, false),
378                        (false, false, 2) => (RetainBothMoveToRight, false),
379                        (false, true, 0) => (RetainLeftMoveNowhere, false),
380                        (false, true, 1) => (RetainLeftMoveToInserted, false),
381                        (true, false, 0) => (RetainRightMoveToInserted, false),
382                        (true, false, 1) => (RetainRightMoveToRight, false),
383                        (true, true, 0) => (RetainNeitherMoveToInserted, false),
384                        _ => (RetainNeitherMoveToInserted, true),
385                    };
386                Self::Ligature {
387                    char_to_insert: Char(remainder),
388                    post_lig_operation,
389                    post_lig_tag_invalid,
390                }
391            }
392        }
393    }
394}
395
396/// A post-lig operation to perform after performing a ligature operation ([`Operation::Ligature`]).
397///
398/// A lig operation starts with a pair of characters (x,y) and a "cursor" on x.
399/// The operation then inserts another character to get, say, (x,z,y).
400/// At this point the cursor is still on x.
401/// The post-lig operation does two things:
402///
403/// 1. First, it potentially deletes x or y or both.
404/// 1. Second, it potentially moves the cursor forward.
405///
406/// After this, if the cursor is not at the end of the list of characters,
407///     the lig-kern program is run for the new pair starting at the cursor.
408///
409/// For example, the post-lig operation [`PostLigOperation::RetainLeftMoveNowhere`] retains
410///     x and deletes y, leaving (x,z).
411/// It then moves the cursor nowhere, leaving it on x.
412/// As a result, the lig kern program for the pair (x,z) will run.
413///
414/// On the other hand, if the post-lig operation [`PostLigOperation::RetainLeftMoveToInserted`]
415///     runs, y is still deleted but the cursor moves to z.
416/// This is the last character in this list and there no more pairs of characters to consider.
417/// The lig/kern program thus terminates.
418///
419/// In general all of the post-lig operations are of the form `RetainXMoveY` where `X`
420///     specifies the characters to retain and `Y` specifies where the cursor should move.
421#[derive(Debug, PartialEq, Eq, Clone, Copy)]
422#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
423pub enum PostLigOperation {
424    /// Corresponds to the `/LIG/` property list element.
425    RetainBothMoveNowhere,
426    /// Corresponds to the `/LIG/>` property list element.
427    RetainBothMoveToInserted,
428    /// Corresponds to the `/LIG/>>` property list element.
429    RetainBothMoveToRight,
430    /// Corresponds to the `LIG/` property list element.
431    RetainRightMoveToInserted,
432    /// Corresponds to the `LIG/>` property list element.
433    RetainRightMoveToRight,
434    /// Corresponds to the `/LIG` property list element.
435    RetainLeftMoveNowhere,
436    /// Corresponds to the `/LIG>` property list element.
437    RetainLeftMoveToInserted,
438    /// Corresponds to the `LIG` property list element.
439    RetainNeitherMoveToInserted,
440}
441
442#[derive(Clone, Debug)]
443pub enum InvalidEntrypointError {
444    Direct { entrypoint: u8 },
445    Indirect { packed: u8, unpacked: u16 },
446}
447
448#[derive(PartialEq, Debug)]
449pub enum ParseCompactProgramError {
450    InvalidOperation(ParseCompactOperationError),
451    InvalidLeftChar,
452    InvalidRightChar,
453    TooManyOperations,
454}
455
456impl From<ParseCompactOperationError> for ParseCompactProgramError {
457    fn from(value: ParseCompactOperationError) -> Self {
458        Self::InvalidOperation(value)
459    }
460}
461
462impl Program {
463    pub fn parse_compact(s: &str) -> Result<(Self, HashMap<Char, u16>), ParseCompactProgramError> {
464        use ParseCompactProgramError::*;
465        let mut operations: BTreeMap<Option<Char>, BTreeMap<Char, Operation>> = Default::default();
466        let mut right_boundary_char: Option<Char> = None;
467        for line in s.lines() {
468            let line = line.trim();
469            if line.is_empty() {
470                continue;
471            }
472            let (l, r, op) = Operation::parse_compact(line)?;
473            let l: Option<Char> = match l {
474                None => None,
475                Some(l) => {
476                    let Ok(l) = l.try_into() else {
477                        return Err(InvalidLeftChar);
478                    };
479                    Some(l)
480                }
481            };
482            if r == '|' {
483                right_boundary_char = Some('|'.try_into().expect("| is ASCII"));
484            }
485
486            let m = operations.entry(l).or_default();
487            let Ok(r) = r.try_into() else {
488                return Err(InvalidRightChar);
489            };
490            m.insert(r, op);
491        }
492        let mut entrypoints: HashMap<Char, u16> = Default::default();
493        let mut left_boundary_char_entrypoint: Option<u16> = None;
494        let mut instructions: Vec<Instruction> = vec![];
495        for (left_char, m) in operations {
496            let i: u16 = instructions
497                .len()
498                .try_into()
499                .map_err(|_| TooManyOperations)?;
500            match left_char {
501                None => left_boundary_char_entrypoint = Some(i),
502                Some(left_char) => {
503                    entrypoints.insert(left_char, i);
504                }
505            }
506            let mut has_operations = false;
507            for (right_char, operation) in m {
508                instructions.push(Instruction {
509                    next_instruction: Some(0),
510                    right_char,
511                    operation,
512                });
513                has_operations = true;
514            }
515            if has_operations {
516                instructions
517                    .last_mut()
518                    .expect("we wrote at least one instruction")
519                    .next_instruction = None;
520            }
521        }
522        Ok((
523            Self {
524                instructions,
525                left_boundary_char_entrypoint,
526                right_boundary_char,
527                passthrough: Default::default(),
528            },
529            entrypoints,
530        ))
531    }
532
533    pub fn unpack_entrypoint(&mut self, entrypoint: u8) -> Result<u16, InvalidEntrypointError> {
534        match self.instructions.get(entrypoint as usize) {
535            None => Err(InvalidEntrypointError::Direct { entrypoint }),
536            Some(instruction) => match instruction.operation {
537                Operation::EntrypointRedirect(u_big, _) => {
538                    if (u_big as usize) < self.instructions.len() {
539                        self.passthrough.insert(entrypoint as u16);
540                        Ok(u_big)
541                    } else {
542                        Err(InvalidEntrypointError::Indirect {
543                            packed: entrypoint,
544                            unpacked: u_big,
545                        })
546                    }
547                }
548                _ => Ok(entrypoint as u16),
549            },
550        }
551    }
552
553    pub fn pack_entrypoints(&mut self, entrypoints: HashMap<Char, u16>) -> HashMap<Char, u8> {
554        let instructions = &mut self.instructions;
555        let ordered_entrypoints = {
556            let mut m: HashMap<u16, Vec<Char>> = Default::default();
557            for (c, u) in entrypoints {
558                m.entry(u).or_default().push(c);
559            }
560            let mut v: Vec<(u16, Vec<Char>)> = m.into_iter().collect();
561            v.sort_by_key(|(u, _)| *u);
562            v
563        };
564        let mut offset: u8 = if self.right_boundary_char.is_some() {
565            // In .tfm files the boundary char is transmitted in each entrypoint redirect instruction.
566            // If there is a boundary char, we need at least one entrypoint redirect to exist so
567            // that the boundary char is there.
568            instructions.push(Instruction {
569                next_instruction: None,
570                right_char: self.right_boundary_char.unwrap_or(Char(0)),
571                operation: Operation::EntrypointRedirect(0, true),
572            });
573            1
574        } else {
575            0
576        };
577        let mut new_entrypoints: HashMap<Char, u8> = Default::default();
578        let mut redirects: Vec<u16> = vec![];
579        for (i, (u16_entrypoint, chars)) in ordered_entrypoints.into_iter().rev().enumerate() {
580            let u: u8 = match (u16_entrypoint + offset as u16).try_into() {
581                Ok(u) => u,
582                Err(_) => {
583                    redirects.push(u16_entrypoint);
584                    if i == 0 && self.right_boundary_char.is_some() {
585                        // This implements the "optimization" "location 0 can do double duty" in PLtoTF.2014.141
586                        instructions.pop();
587                        offset = 0;
588                    }
589                    let u = offset;
590                    offset = offset.checked_add(1).expect(
591                        "offset is incremented at most once per 8-bit-char and so cannot exceed 256",
592                    );
593                    u
594                }
595            };
596            for c in chars {
597                new_entrypoints.insert(c, u);
598            }
599        }
600        for redirect in redirects {
601            instructions.push(Instruction {
602            next_instruction: None,
603            right_char: self.right_boundary_char.unwrap_or(Char(0)),
604            operation: Operation::EntrypointRedirect(
605                redirect.checked_add(offset as u16).expect("the inputted lig/kern instructions vector doesn't have enough space for new instructions"),
606            true,
607        ),
608        });
609        }
610        instructions.rotate_right(offset as usize);
611        if let Some(boundary_char_entrypoint) = self.left_boundary_char_entrypoint {
612            instructions.push(Instruction {
613                next_instruction: None,
614                right_char: Char(0),
615                operation: Operation::EntrypointRedirect(
616                    boundary_char_entrypoint + offset as u16,
617                    false,
618                ),
619            })
620        }
621        new_entrypoints
622    }
623
624    pub fn unpack_kerns(&mut self) -> Vec<FixWord> {
625        let mut kerns = vec![];
626        let mut kerns_dedup = HashMap::<FixWord, usize>::new();
627        for instruction in &mut self.instructions {
628            if let Operation::Kern(kern) = instruction.operation {
629                use std::collections::hash_map::Entry;
630                let index = match kerns_dedup.entry(kern) {
631                    Entry::Occupied(o) => *o.get(),
632                    Entry::Vacant(v) => {
633                        let l = kerns.len();
634                        v.insert(l);
635                        kerns.push(kern);
636                        l
637                    }
638                };
639                instruction.operation = Operation::KernAtIndex(index.try_into().unwrap());
640            }
641        }
642        kerns
643    }
644
645    pub fn pack_kerns(&mut self, kerns: &[FixWord]) {
646        for i in &mut self.instructions {
647            if let Operation::KernAtIndex(index) = &i.operation {
648                // TODO: log a warning if the index is not in the kerns array as
649                // in TFtoPL.2014.76
650                i.operation =
651                    Operation::Kern(kerns.get(*index as usize).copied().unwrap_or_default())
652            }
653        }
654    }
655
656    pub fn reachable_iter<I: Iterator<Item = (Char, u16)>>(
657        &self,
658        entrypoints: I,
659    ) -> ReachableIter<'_> {
660        ReachableIter {
661            next: 0,
662            reachable: self.reachable_array(entrypoints),
663            program: self,
664        }
665    }
666
667    /// Iterate over the lig/kern instructions for a specific entrypoint.
668    pub fn instructions_for_entrypoint(
669        &self,
670        entrypoint: u16,
671    ) -> InstructionsForEntrypointIter<'_> {
672        InstructionsForEntrypointIter {
673            next: entrypoint as usize,
674            instructions: &self.instructions,
675        }
676    }
677
678    pub fn is_seven_bit_safe(&self, entrypoints: HashMap<Char, u16>) -> bool {
679        entrypoints
680            .into_iter()
681            .filter(|(c, _)| c.is_seven_bit())
682            .flat_map(|(_, e)| self.instructions_for_entrypoint(e))
683            .filter(|(_, instruction)| instruction.right_char.is_seven_bit())
684            .filter_map(|(_, instruction)| match instruction.operation {
685                Operation::Ligature { char_to_insert, .. } => Some(char_to_insert),
686                _ => None,
687            })
688            .all(|c| c.is_seven_bit())
689    }
690
691    pub fn validate_and_fix<I, T>(
692        &mut self,
693        smallest_char: Char,
694        entrypoints: I,
695        char_exists: T,
696        kerns: &[FixWord],
697    ) -> Vec<ValidationWarning>
698    where
699        I: Iterator<Item = (Char, u8)>,
700        T: Fn(Char) -> bool,
701    {
702        let mut warnings = vec![];
703        // TFtoPL.2014.68
704        if let Some(entrypoint) = self.left_boundary_char_entrypoint {
705            if self.instructions.len() <= entrypoint as usize {
706                self.left_boundary_char_entrypoint = None;
707                warnings.push(ValidationWarning::InvalidBoundaryCharEntrypoint);
708            }
709        }
710        // TFtoPL.2014.69
711        let unpacked_entrypoints: Vec<(Char, u16)> = entrypoints
712            .into_iter()
713            .filter_map(|(c, e)| match self.unpack_entrypoint(e) {
714                Ok(e) => Some((c, e)),
715                Err(_) => {
716                    warnings.push(ValidationWarning::InvalidEntrypoint(c));
717                    None
718                }
719            })
720            .collect();
721        let reachable = self.reachable_array(unpacked_entrypoints.iter().cloned());
722        let n = self.instructions.len();
723        // TFtoPL.2014.70
724        self.instructions
725            .iter_mut()
726            .zip(reachable.iter())
727            .enumerate()
728            .filter(|(_, (_, reachable))| **reachable)
729            .filter_map(|(i, (instruction, _))| {
730                instruction
731                    .next_instruction
732                    .map(|inc| (i, inc, instruction))
733            })
734            .filter(|(i, inc, _)| *i + (*inc as usize) + 1 >= n)
735            .for_each(|(i, _, instruction)| {
736                instruction.next_instruction = None;
737                warnings.push(ValidationWarning::SkipTooLarge(i));
738            });
739
740        for (i, (instruction, reachable)) in self.instructions.iter_mut().zip(reachable).enumerate()
741        {
742            let is_kern_step = match instruction.operation {
743                Operation::Kern(_) | Operation::KernAtIndex(_) => true,
744                Operation::Ligature { .. } => false,
745                Operation::EntrypointRedirect(r, _) => {
746                    if let Ok(u) = i.try_into() {
747                        // If it's a passthrough instruction, don't issue a warning.
748                        if !reachable && self.passthrough.contains(&u) {
749                            continue;
750                        }
751                    }
752                    if r as usize >= n {
753                        warnings.push(ValidationWarning::EntrypointRedirectTooBig(i));
754                    }
755                    continue;
756                }
757            };
758            if !char_exists(instruction.right_char)
759                && Some(instruction.right_char) != self.right_boundary_char
760            {
761                warnings.push(if is_kern_step {
762                    ValidationWarning::KernStepForNonExistentCharacter {
763                        instruction_index: i,
764                        right_char: instruction.right_char,
765                        new_right_char: smallest_char,
766                    }
767                } else {
768                    ValidationWarning::LigatureStepForNonExistentCharacter {
769                        instruction_index: i,
770                        right_char: instruction.right_char,
771                        new_right_char: smallest_char,
772                    }
773                });
774                instruction.right_char = smallest_char;
775            }
776            match &mut instruction.operation {
777                Operation::Kern(_) => {}
778                Operation::KernAtIndex(k) => {
779                    if *k as usize >= kerns.len() {
780                        warnings.push(ValidationWarning::KernIndexTooBig(i));
781                        instruction.operation = Operation::Kern(FixWord::ZERO);
782                    }
783                }
784                Operation::Ligature {
785                    char_to_insert,
786                    post_lig_tag_invalid,
787                    ..
788                } => {
789                    if !char_exists(*char_to_insert) {
790                        warnings.push(
791                            ValidationWarning::LigatureStepProducesNonExistentCharacter {
792                                instruction_index: i,
793                                replacement_char: *char_to_insert,
794                                new_replacement_char: smallest_char,
795                            },
796                        );
797                        *char_to_insert = smallest_char;
798                    }
799                    if *post_lig_tag_invalid {
800                        warnings.push(ValidationWarning::InvalidLigTag(i));
801                        *post_lig_tag_invalid = false;
802                    }
803                }
804                Operation::EntrypointRedirect(_, _) => {}
805            }
806        }
807
808        let entrypoints: HashMap<Char, u16> = unpacked_entrypoints.into_iter().collect();
809        // We are only running the compiler to find errors and are disregarding the actual
810        // result. Thus the true design size is not needed. Using this dummy avoids having
811        // to plumb in the real design size into this function.
812        let dummy_design_size = FixWord::ONE * 10;
813        let (_, errors) =
814            super::CompiledProgram::compile(self, dummy_design_size, kerns, entrypoints);
815        for err in errors {
816            warnings.push(ValidationWarning::InfiniteLoop(err));
817        }
818        warnings
819    }
820
821    fn reachable_array<I: Iterator<Item = (Char, u16)>>(&self, entrypoints: I) -> Vec<bool> {
822        let mut reachable = vec![false; self.instructions.len()];
823        // TFtoPL.2014.68
824        for (_, entrypoint) in entrypoints {
825            if let Some(slot) = reachable.get_mut(entrypoint as usize) {
826                *slot = true;
827            }
828        }
829        // TFtoPL.2014.69
830        if let Some(entrypoint) = self.left_boundary_char_entrypoint {
831            // There is a bug (?) in Knuth's TFtoPL when the entrypoint for the boundary char
832            // points at the last instruction - i.e., the instruction containing the
833            // boundary char entrypoint. In this case the last instruction is marked as passthrough
834            // and not accessible.
835            if entrypoint as usize != self.instructions.len() - 1 {
836                if let Some(slot) = reachable.get_mut(entrypoint as usize) {
837                    *slot = true;
838                }
839            }
840        }
841        // TFtoPL.2014.70
842        for i in 0..reachable.len() {
843            if !reachable[i] {
844                continue;
845            }
846            if let Some(inc) = self.instructions[i].next_instruction {
847                if let Some(slot) = reachable.get_mut(i + inc as usize + 1) {
848                    *slot = true;
849                }
850            }
851        }
852        reachable
853    }
854}
855
856#[derive(Clone, Debug, PartialEq, Eq)]
857pub enum ValidationWarning {
858    SkipTooLarge(usize),
859    LigatureStepForNonExistentCharacter {
860        // Index of the buggy lig instruction in the program.
861        instruction_index: usize,
862        // Right character that is non existent.
863        right_char: Char,
864        // New right character after the buggy instruction is fixed.
865        //
866        // This is set to bc by tftopl. Note there is no guarantee that bc
867        // itself exists, so the instruction may still be buggy.
868        new_right_char: Char,
869    },
870    KernStepForNonExistentCharacter {
871        // Index of the buggy kern instruction in the program.
872        instruction_index: usize,
873        // Right character that is non existent.
874        right_char: Char,
875        // New right character after the buggy instruction is fixed.
876        //
877        // This is set to bc by tftopl. Note there is no guarantee that bc
878        // itself exists, so the instruction may still be buggy.
879        new_right_char: Char,
880    },
881    LigatureStepProducesNonExistentCharacter {
882        // Index of the buggy kern instruction in the program.
883        instruction_index: usize,
884        // Replacement character that is non existent.
885        replacement_char: Char,
886        // New replacement character after the buggy instruction is fixed.
887        //
888        // This is set to bc by tftopl. Note there is no guarantee that bc
889        // itself exists, so the instruction may not be fully fixed.
890        new_replacement_char: Char,
891    },
892    KernIndexTooBig(usize),
893    InvalidLigTag(usize),
894    EntrypointRedirectTooBig(usize),
895    InvalidEntrypoint(Char),
896    InvalidBoundaryCharEntrypoint,
897    InfiniteLoop(super::InfiniteLoopError),
898}
899
900impl ValidationWarning {
901    /// Returns the warning message the TFtoPL program prints for this kind of error.
902    pub fn tftopl_message(&self) -> String {
903        use ValidationWarning::*;
904        match self {
905            SkipTooLarge(i) => {
906                format!["Bad TFM file: Ligature/kern step {i} skips too far;\nI made it stop."]
907            }
908            LigatureStepForNonExistentCharacter { right_char, .. } => format![
909                "Bad TFM file: Ligature step for nonexistent character '{:03o}.",
910                right_char.0
911            ],
912            KernStepForNonExistentCharacter { right_char, .. } => format![
913                "Bad TFM file: Kern step for nonexistent character '{:03o}.",
914                right_char.0
915            ],
916            LigatureStepProducesNonExistentCharacter {
917                replacement_char, ..
918            } => format![
919                "Bad TFM file: Ligature step produces the nonexistent character '{:03o}.",
920                replacement_char.0
921            ],
922            KernIndexTooBig(_) => "Bad TFM file: Kern index too large.".to_string(),
923            InvalidLigTag(_) => "Ligature step with nonstandard code changed to LIG".to_string(),
924            EntrypointRedirectTooBig(_) => {
925                "Bad TFM file: Ligature unconditional stop command address is too big.".to_string()
926            }
927            InvalidEntrypoint(c) => {
928                format![" \nLigature/kern starting index for character '{:03o} is too large;\nso I removed it.", c.0]
929            }
930            InvalidBoundaryCharEntrypoint => {
931                " \nLigature/kern starting index for boundarychar is too large;so I removed it."
932                    .to_string()
933            }
934            InfiniteLoop(err) => err.pltotf_message(),
935        }
936    }
937
938    /// Returns the section in Knuth's TFtoPL (version 2014) in which this warning occurs.
939    pub fn tftopl_section(&self) -> u8 {
940        use ValidationWarning::*;
941        match self {
942            SkipTooLarge(_) => 70,
943            LigatureStepForNonExistentCharacter { .. }
944            | LigatureStepProducesNonExistentCharacter { .. }
945            | InvalidLigTag(_) => 77,
946            KernStepForNonExistentCharacter { .. } | KernIndexTooBig(_) => 76,
947            EntrypointRedirectTooBig(_) => 74,
948            InvalidEntrypoint(_) => 67,
949            InvalidBoundaryCharEntrypoint => 69,
950            InfiniteLoop(_) => 90,
951        }
952    }
953}
954
955pub struct ReachableIter<'a> {
956    next: u16,
957    reachable: Vec<bool>,
958    program: &'a Program,
959}
960
961#[derive(Debug)]
962pub enum ReachableIterItem {
963    Reachable { adjusted_skip: Option<u8> },
964    Unreachable,
965    Passthrough,
966}
967
968impl<'a> Iterator for ReachableIter<'a> {
969    type Item = ReachableIterItem;
970
971    fn next(&mut self) -> Option<Self::Item> {
972        let this = self.next;
973        let instruction = self.program.instructions.get(this as usize)?;
974        self.next += 1;
975        Some(if self.reachable[this as usize] {
976            let adjusted_skip = match instruction.next_instruction {
977                None | Some(0) => None,
978                Some(inc) => {
979                    match self
980                        .reachable
981                        .get(this as usize + 1..this as usize + 1 + inc as usize)
982                    {
983                        None => None,
984                        Some(n) => {
985                            let reachable_skipped: u8 =
986                                n.iter()
987                                .filter(|reachable| **reachable)
988                                .count()
989                                .try_into()
990                                .expect("iterating over at most u8::MAX elements, so the count will be at most u8::MAX");
991                            Some(reachable_skipped)
992                        }
993                    }
994                }
995            };
996            ReachableIterItem::Reachable { adjusted_skip }
997        } else if self.program.passthrough.contains(&this) {
998            ReachableIterItem::Passthrough
999        } else {
1000            ReachableIterItem::Unreachable
1001        })
1002    }
1003}
1004
1005/// Iterator over the lig/kern instructions for a specific entrypoint.
1006///
1007/// Create using [`Program::instructions_for_entrypoint`].
1008pub struct InstructionsForEntrypointIter<'a> {
1009    next: usize,
1010    instructions: &'a [Instruction],
1011}
1012
1013impl<'a> Iterator for InstructionsForEntrypointIter<'a> {
1014    type Item = (usize, &'a Instruction);
1015
1016    fn next(&mut self) -> Option<Self::Item> {
1017        self.instructions.get(self.next).map(|i| {
1018            let this = self.next;
1019            self.next = match i.next_instruction {
1020                None => usize::MAX,
1021                Some(inc) => self.next + inc as usize + 1,
1022            };
1023            (this, i)
1024        })
1025    }
1026}
1027
1028#[cfg(test)]
1029mod tests {
1030    use super::*;
1031
1032    macro_rules! parse_compact_program_tests {
1033        ( $( ($name: ident, $input: expr, $want: expr, ),)+ ) => {
1034            mod parse_compact_program {
1035                use super::*;
1036                use ParseCompactProgramError::*;
1037            $(
1038                #[test]
1039                fn $name() {
1040                    let want: Result<(Program, HashMap<Char, u16>), ParseCompactProgramError> = $want;
1041                    assert_eq![
1042                        Program::parse_compact($input),
1043                        want,
1044                    ];
1045                }
1046            )+
1047            }
1048        };
1049    }
1050
1051    parse_compact_program_tests!(
1052        (
1053            ok,
1054            "
1055                ac -> _y^_
1056                ab -> axb^
1057                bc -> b[1]c
1058            ",
1059            Ok((
1060                Program {
1061                    instructions: vec![
1062                        Instruction {
1063                            next_instruction: Some(0),
1064                            right_char: 'b'.try_into().unwrap(),
1065                            operation: Operation::Ligature {
1066                                char_to_insert: 'x'.try_into().unwrap(),
1067                                post_lig_operation: PostLigOperation::RetainBothMoveToRight,
1068                                post_lig_tag_invalid: false,
1069                            }
1070                        },
1071                        Instruction {
1072                            next_instruction: None,
1073                            right_char: 'c'.try_into().unwrap(),
1074                            operation: Operation::Ligature {
1075                                char_to_insert: 'y'.try_into().unwrap(),
1076                                post_lig_operation: PostLigOperation::RetainNeitherMoveToInserted,
1077                                post_lig_tag_invalid: false,
1078                            }
1079                        },
1080                        Instruction {
1081                            next_instruction: None,
1082                            right_char: 'c'.try_into().unwrap(),
1083                            operation: Operation::Kern(FixWord(1)),
1084                        },
1085                    ],
1086                    left_boundary_char_entrypoint: None,
1087                    right_boundary_char: None,
1088                    passthrough: Default::default(),
1089                },
1090                HashMap::from([('a'.try_into().unwrap(), 0), ('b'.try_into().unwrap(), 2),])
1091            )),
1092        ),
1093        (invalid_left_char, "αb -> _c^_", Err(InvalidLeftChar),),
1094        (invalid_right_char, "bα -> _c^_", Err(InvalidRightChar),),
1095    );
1096
1097    macro_rules! parse_compact_operation_tests {
1098        ( $( ($name: ident, $input: expr, $want: expr, ),)+ ) => {
1099            mod parse_compact_operation {
1100                use super::*;
1101                use ParseCompactOperationError::*;
1102            $(
1103                #[test]
1104                fn $name() {
1105                    let want: Result<(Option<char>, char, Operation), ParseCompactOperationError> = $want;
1106                    assert_eq![
1107                        Operation::parse_compact($input),
1108                        want,
1109                    ];
1110                    if let Ok((l, r, op)) = want {
1111                        let s = format!["{}", op.display_compact(l.unwrap_or('|'), r)];
1112                        assert_eq![s, $input];
1113                    }
1114                }
1115            )+
1116            }
1117        };
1118    }
1119
1120    parse_compact_operation_tests!(
1121        (one_word, "ab", Err(NotThreeWords),),
1122        (four_words, "ab -> axb^ four", Err(NotThreeWords),),
1123        (not_an_arrow, "ab %% axb^", Err(MiddleWordIsNotAnArrow),),
1124        (first_too_small, "a -> axb^", Err(FirstWordIsWrongSize),),
1125        (first_too_big, "abc -> axb^", Err(FirstWordIsWrongSize),),
1126        (left_char_invalid, "_b -> _xb^", Err(LeftCharInvalid),),
1127        (right_char_invalid, "a_ -> ax_^", Err(RightCharInvalid),),
1128        (
1129            cursor_not_after_char_1,
1130            "ab -> ^_x_",
1131            Err(CursorNotAfterChar),
1132        ),
1133        (
1134            cursor_not_after_char_2,
1135            "ab -> _^x_",
1136            Err(CursorNotAfterChar),
1137        ),
1138        (
1139            cursor_not_after_char_3,
1140            "ab -> _x_^",
1141            Err(CursorNotAfterChar),
1142        ),
1143        (
1144            cursor_not_after_char_4,
1145            "ab -> _^xb",
1146            Err(CursorNotAfterChar),
1147        ),
1148        (
1149            cursor_not_after_char_5,
1150            "ab -> ax_^",
1151            Err(CursorNotAfterChar),
1152        ),
1153        (
1154            lig_1,
1155            "ab -> a^xb",
1156            Ok((
1157                Some('a'),
1158                'b',
1159                Operation::Ligature {
1160                    char_to_insert: 'x'.try_into().unwrap(),
1161                    post_lig_operation: PostLigOperation::RetainBothMoveNowhere,
1162                    post_lig_tag_invalid: false,
1163                }
1164            )),
1165        ),
1166        (
1167            lig_2,
1168            "ab -> ax^b",
1169            Ok((
1170                Some('a'),
1171                'b',
1172                Operation::Ligature {
1173                    char_to_insert: 'x'.try_into().unwrap(),
1174                    post_lig_operation: PostLigOperation::RetainBothMoveToInserted,
1175                    post_lig_tag_invalid: false,
1176                }
1177            )),
1178        ),
1179        (
1180            lig_3,
1181            "ab -> axb^",
1182            Ok((
1183                Some('a'),
1184                'b',
1185                Operation::Ligature {
1186                    char_to_insert: 'x'.try_into().unwrap(),
1187                    post_lig_operation: PostLigOperation::RetainBothMoveToRight,
1188                    post_lig_tag_invalid: false,
1189                }
1190            )),
1191        ),
1192        (
1193            lig_4,
1194            "ab -> _x^b",
1195            Ok((
1196                Some('a'),
1197                'b',
1198                Operation::Ligature {
1199                    char_to_insert: 'x'.try_into().unwrap(),
1200                    post_lig_operation: PostLigOperation::RetainRightMoveToInserted,
1201                    post_lig_tag_invalid: false,
1202                }
1203            )),
1204        ),
1205        (
1206            lig_5,
1207            "ab -> _xb^",
1208            Ok((
1209                Some('a'),
1210                'b',
1211                Operation::Ligature {
1212                    char_to_insert: 'x'.try_into().unwrap(),
1213                    post_lig_operation: PostLigOperation::RetainRightMoveToRight,
1214                    post_lig_tag_invalid: false,
1215                }
1216            )),
1217        ),
1218        (
1219            lig_6,
1220            "ab -> a^x_",
1221            Ok((
1222                Some('a'),
1223                'b',
1224                Operation::Ligature {
1225                    char_to_insert: 'x'.try_into().unwrap(),
1226                    post_lig_operation: PostLigOperation::RetainLeftMoveNowhere,
1227                    post_lig_tag_invalid: false,
1228                }
1229            )),
1230        ),
1231        (
1232            lig_7,
1233            "ab -> ax^_",
1234            Ok((
1235                Some('a'),
1236                'b',
1237                Operation::Ligature {
1238                    char_to_insert: 'x'.try_into().unwrap(),
1239                    post_lig_operation: PostLigOperation::RetainLeftMoveToInserted,
1240                    post_lig_tag_invalid: false,
1241                }
1242            )),
1243        ),
1244        (
1245            lig_8,
1246            "ab -> _x^_",
1247            Ok((
1248                Some('a'),
1249                'b',
1250                Operation::Ligature {
1251                    char_to_insert: 'x'.try_into().unwrap(),
1252                    post_lig_operation: PostLigOperation::RetainNeitherMoveToInserted,
1253                    post_lig_tag_invalid: false,
1254                }
1255            )),
1256        ),
1257        (
1258            lig_left_boundary_char,
1259            "|b -> |x^b",
1260            Ok((
1261                None,
1262                'b',
1263                Operation::Ligature {
1264                    char_to_insert: 'x'.try_into().unwrap(),
1265                    post_lig_operation: PostLigOperation::RetainBothMoveToInserted,
1266                    post_lig_tag_invalid: false,
1267                }
1268            )),
1269        ),
1270        (
1271            kern_1,
1272            "ab -> a[13]b",
1273            Ok((Some('a'), 'b', Operation::Kern(FixWord(13)),)),
1274        ),
1275        (
1276            kern_2,
1277            "ab -> a[-789]b",
1278            Ok((Some('a'), 'b', Operation::Kern(FixWord(-789)),)),
1279        ),
1280        (
1281            kern_left_boundary_char,
1282            "|b -> |[-789]b",
1283            Ok((None, 'b', Operation::Kern(FixWord(-789)),)),
1284        ),
1285        (kern_char_before_bracket, "ab -> ax[1]b", Err(InvalidKern),),
1286        (kern_no_close_bracket, "ab -> a[13", Err(InvalidKern),),
1287        (kern_no_right_char, "ab -> a[13]", Err(MissingRightChar),),
1288        (kern_trailing_chars, "ab -> a[13]bx", Err(InvalidKern),),
1289        (kern_wrong_left_char, "ab -> c[13]b", Err(MissingLeftChar),),
1290        (kern_wrong_right_char, "ab -> a[13]c", Err(MissingRightChar),),
1291        (kern_invalid_number, "ab -> a[xyz]b", Err(InvalidKern),),
1292        (kern_empty_number, "ab -> a[]b", Err(InvalidKern),),
1293        (third_too_small, "ab -> axb", Err(InvalidLigature),),
1294        (third_too_big, "ab -> axbc^", Err(InvalidLigature),),
1295        (missing_cursor, "ab -> axb_", Err(MissingCursor),),
1296        (
1297            replacement_char_is_underscore,
1298            "ab -> a^_b",
1299            Err(ReplacementCharInvalid),
1300        ),
1301        (
1302            replacement_char_out_of_range,
1303            "ab -> a^αb",
1304            Err(ReplacementCharInvalid),
1305        ),
1306        (missing_left_char, "ab -> cxb^", Err(MissingLeftChar),),
1307        (missing_right_char, "ab -> axc^", Err(MissingRightChar),),
1308        (left_char_is_caret, "^b -> ^xb^", Err(LeftCharInvalid),),
1309        (right_char_is_caret, "a^ -> ax^^", Err(RightCharInvalid),),
1310    );
1311}