tfm/pl/
cst.rs

1//! Concrete syntax tree for property list files
2//!
3//! The CST, or parse tree, of a property list file mostly just represents the nesting of parenthesis.
4//! The AST is the most useful representation.
5//!
6//! Rough form of the production rules, where `*` means 0 or more, `+` means at least 1, `?` means optional:
7//!
8//! - `<property list> -> <list element>*`
9//! - `<list element> -> <open parenthesis><word>(<whitespace>+<word>)*<property list><closed parentheses>`
10//! - `<word>` -> ASCII string without whitespace or parentheses. Can be empty.
11//! - `<whitespace> -> <space><whitespace>? | <newline><whitespace>?`
12//! - `<open parenthesis> -> '('<whitespace>?`
13//! - `<closed parenthesis> -> ')'<whitespace>?`
14use super::error::{ParseWarning, ParseWarningKind};
15
16/// Concrete syntax tree for property list files
17#[derive(Debug, PartialEq, Eq)]
18pub struct Cst(pub Vec<Node>);
19
20impl Cst {
21    /// Build an CST directly from source code.
22    pub fn from_pl_source_code(source: &str) -> (Cst, Vec<ParseWarning>) {
23        parse(source)
24    }
25
26    /// Display the CST.
27    pub fn display(
28        &self,
29        starting_indent: usize,
30        additional_indent: usize,
31    ) -> impl std::fmt::Display + '_ {
32        DisplaySlice {
33            nodes: &self.0,
34            starting_indent,
35            additional_indent,
36        }
37    }
38}
39
40/// Helper type for displaying slices of nodes.
41struct DisplaySlice<'a> {
42    nodes: &'a [Node],
43    starting_indent: usize,
44    additional_indent: usize,
45}
46
47impl<'a> std::fmt::Display for DisplaySlice<'a> {
48    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
49        for node in self.nodes {
50            write!(
51                f,
52                "{}",
53                node.display(self.starting_indent, self.additional_indent)
54            )?;
55        }
56        Ok(())
57    }
58}
59
60/// Helper type for displaying nodes.
61struct DisplayNode<'a> {
62    node: &'a Node,
63    starting_indent: usize,
64    additional_indent: usize,
65}
66
67impl Node {
68    /// Display the node.
69    pub fn display(
70        &self,
71        starting_indent: usize,
72        additional_indent: usize,
73    ) -> impl std::fmt::Display + '_ {
74        DisplayNode {
75            node: self,
76            starting_indent,
77            additional_indent,
78        }
79    }
80}
81
82impl<'a> std::fmt::Display for DisplayNode<'a> {
83    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
84        let starting_indent = self.starting_indent;
85        let additional_indent = self.additional_indent;
86        match &self.node {
87            // TODO: better handle all of the cases here with preceding newlines and trailing newlines
88            Node::Comment(e) => {
89                if e.contains('\n') {
90                    // TODO this assumes e starts and ends with a newline?
91                    // these kinds of bugs will appear if we write a PLST formatter
92                    write!(f, "{}(COMMENT{e}", " ".repeat(starting_indent))?;
93                    writeln!(f, "{})", " ".repeat(starting_indent + additional_indent))?;
94                } else {
95                    writeln!(f, "{}(COMMENT {e})", " ".repeat(starting_indent))?;
96                }
97            }
98            Node::Regular(v) => {
99                write!(f, "{}({}", " ".repeat(starting_indent), &v.key)?;
100                if let Some(data) = &v.data {
101                    write!(f, " {}", data)?;
102                }
103                if let Some(children) = &v.children {
104                    writeln!(f)?;
105                    for child in children {
106                        write!(
107                            f,
108                            "{}",
109                            child.display(starting_indent + additional_indent, additional_indent)
110                        )?;
111                    }
112                    write!(f, "{}", " ".repeat(starting_indent + additional_indent))?;
113                }
114                writeln!(f, ")")?;
115            }
116        }
117        Ok(())
118    }
119}
120
121/// Node in the CST.
122#[derive(Debug, PartialEq, Eq)]
123pub enum Node {
124    /// A comment node.
125    Comment(String),
126    /// A regular non-comment node.
127    Regular(RegularNode),
128}
129
130/// Value of a regular node in the CST.
131#[derive(Debug, PartialEq, Eq)]
132pub struct RegularNode {
133    /// Span of the opening parenthesis for this node in the source file.
134    ///
135    /// It always has length 1.
136    pub opening_parenthesis_span: std::ops::Range<usize>,
137    /// Key of the node; e.g., `CHECKSUM`.
138    pub key: String,
139    /// Span of the key in the source file
140    pub key_span: std::ops::Range<usize>,
141    /// Data of the node.
142    ///
143    /// The values `None` and `Some("".to_string())` are semantically the same.
144    /// However they are displayed differently.
145    /// For example the stop node is displayed as `(STOP)` but an empty coding scheme node
146    ///     is displayed as `(CODINGSCHEME )`.
147    pub data: Option<String>,
148    /// Span of the raw data in the source file.
149    pub data_span: std::ops::Range<usize>,
150    /// Child nodes, for nodes that themselves contain property lists. E.g. `CHARACTER`.
151    ///
152    /// The values `None` and `Some(vec![])` are semantically the same.
153    /// However they are displayed differently.
154    /// For example the stop node is displayed as `(STOP)` but an empty lig table node
155    ///     is displayed as `(LIGTABLE\n)`.
156    pub children: Option<Vec<Node>>,
157    /// Span of the closing parenthesis for this node in the source file.
158    ///
159    /// It either has length 1 (for a closing parenthesis that appears in the source file)
160    /// or 0 (for a closing parenthesis automatically added to balance an unbalanced opening parenthesis).
161    pub closing_parenthesis_span: std::ops::Range<usize>,
162}
163
164fn parse(s: &str) -> (Cst, Vec<ParseWarning>) {
165    let mut warnings = vec![];
166    let mut cst = Cst(vec![]);
167    let mut stack: Vec<RegularNode> = vec![];
168    let push = |cst: &mut Cst, stack: &mut Vec<RegularNode>, node: Node| match stack.last_mut() {
169        None => cst.0.push(node),
170        Some(tail) => tail.children.get_or_insert(vec![]).push(node),
171    };
172
173    let mut iter = ParseIter::new(s);
174    while let Some((pos, c)) = iter.peek() {
175        match c {
176            '(' => {
177                iter.next();
178                let (key, key_span) = iter.accumulate_key();
179
180                if key == "COMMENT" {
181                    // todo: trim whitespace from the front of the comment?
182                    let mut comment = String::new();
183                    let mut comment_stack: Vec<usize> = vec![pos];
184                    for (u, c) in iter.by_ref() {
185                        match c {
186                            '(' => {
187                                comment_stack.push(u);
188                            }
189                            ')' => {
190                                comment_stack.pop();
191                                if comment_stack.is_empty() {
192                                    break;
193                                }
194                            }
195                            _ => {}
196                        }
197                        comment.push(c);
198                    }
199                    // If there are unbalanced opening parentheses in a comment, only 2 warnings
200                    // are printed irrespective of the depth. The first warning is for the opening
201                    // parenthesis starting the comment, and the second warning is for the opening
202                    // parenthesis that increases the level to 1.
203                    while comment_stack.len() > 2 {
204                        comment.push(')');
205                        comment_stack.pop();
206                    }
207                    if comment_stack.len() == 2 {
208                        comment.push(')');
209                    }
210                    while let Some(u) = comment_stack.pop() {
211                        let pos = iter.next_pos;
212                        warnings.push(ParseWarning {
213                            span: pos..pos,
214                            knuth_pltotf_offset: Some(pos),
215                            kind: ParseWarningKind::UnbalancedOpeningParenthesis {
216                                opening_parenthesis_span: u..u + 1,
217                            },
218                        });
219                    }
220                    push(&mut cst, &mut stack, Node::Comment(comment));
221                } else {
222                    iter.trim_whitespace();
223                    let (data, data_span) = iter.accumulate_string();
224                    let value = RegularNode {
225                        opening_parenthesis_span: pos..pos + 1,
226                        key,
227                        key_span,
228                        data: Some(data),
229                        data_span,
230                        children: Some(vec![]),
231                        // the correct value is figured out later
232                        closing_parenthesis_span: 0..0,
233                    };
234                    stack.push(value);
235                }
236            }
237            ')' => {
238                iter.next();
239                match stack.pop() {
240                    None => warnings.push(ParseWarning {
241                        span: pos..pos + 1,
242                        knuth_pltotf_offset: Some(pos),
243                        kind: ParseWarningKind::UnexpectedClosingParenthesis,
244                    }),
245                    Some(mut finished) => {
246                        finished.closing_parenthesis_span = pos..pos + 1;
247                        push(&mut cst, &mut stack, Node::Regular(finished));
248                    }
249                }
250            }
251            ' ' | '\n' => {
252                iter.next();
253                continue;
254            }
255            _ => {
256                let (junk, span) = iter.accumulate_string();
257                warnings.push(ParseWarning {
258                    span: span.clone(),
259                    knuth_pltotf_offset: Some(span.start + 1),
260                    kind: ParseWarningKind::JunkInsidePropertyList { junk },
261                })
262            }
263        }
264    }
265    while let Some(mut finished) = stack.pop() {
266        let pos = iter.next_pos;
267        warnings.push(ParseWarning {
268            span: pos..pos,
269            knuth_pltotf_offset: Some(pos),
270            kind: ParseWarningKind::UnbalancedOpeningParenthesis {
271                opening_parenthesis_span: finished.opening_parenthesis_span.clone(),
272            },
273        });
274        finished.closing_parenthesis_span = pos..pos;
275        push(&mut cst, &mut stack, Node::Regular(finished));
276    }
277    warnings.extend(iter.warnings);
278    (cst, warnings)
279}
280
281struct ParseIter<'a> {
282    iter: std::iter::Peekable<super::Chars<'a>>,
283    warnings: Vec<ParseWarning>,
284    next_pos: usize,
285}
286
287impl<'a> Iterator for ParseIter<'a> {
288    type Item = (usize, char);
289
290    fn next(&mut self) -> Option<Self::Item> {
291        self.iter.next().map(|c| {
292            let pos = self.next_pos;
293            self.next_pos += 1;
294            (pos, c)
295        })
296    }
297}
298
299impl<'a> ParseIter<'a> {
300    fn peek(&mut self) -> Option<<Self as Iterator>::Item> {
301        self.iter.peek().copied().map(|c| (self.next_pos, c))
302    }
303    fn new(s: &'a str) -> Self {
304        Self {
305            iter: super::Chars::new(s).peekable(),
306            warnings: vec![],
307            next_pos: 0,
308        }
309    }
310    fn trim_whitespace(&mut self) {
311        while let Some((_, c)) = self.peek() {
312            match c {
313                ' ' | '\n' => {
314                    self.next();
315                }
316                _ => break,
317            }
318        }
319    }
320    fn accumulate_key(&mut self) -> (String, std::ops::Range<usize>) {
321        self.accumulate_internal(|c| c.is_alphanumeric() || c == '/' || c == '>')
322    }
323    fn accumulate_string(&mut self) -> (String, std::ops::Range<usize>) {
324        self.accumulate_internal(|c| c != ')' && c != '(')
325    }
326    fn accumulate_internal<F: Fn(char) -> bool>(
327        &mut self,
328        is_allowed_char: F,
329    ) -> (String, std::ops::Range<usize>) {
330        let mut s = String::new();
331        let mut start: Option<usize> = None;
332        while let Some((pos, c)) = self.peek() {
333            if is_allowed_char(c) {
334                self.next();
335                s.push(c);
336                start.get_or_insert(pos);
337            } else {
338                break;
339            }
340        }
341        let start = start.unwrap_or(self.next_pos);
342        (s, start..self.next_pos)
343    }
344}
345
346#[cfg(test)]
347mod tests {
348    use super::*;
349
350    fn run(source: &str, want: Vec<Node>, want_errors: Vec<ParseWarning>) {
351        let (got, got_errors) = Cst::from_pl_source_code(source);
352        assert_eq!(got, Cst(want));
353        assert_eq!(got_errors, want_errors);
354    }
355
356    macro_rules! cst_test {
357        ( $( ($name: ident, $input: expr, $want: expr, $want_errors: expr, ), )+ ) => {
358            $(
359                #[test]
360                fn $name() {
361                    let input = $input;
362                    let want = $want;
363                    let want_errors = $want_errors;
364                    run(input, want, want_errors);
365                }
366            )+
367        };
368    }
369
370    cst_test!(
371        (
372            basic,
373            " (Hello World (Nested One) (Nested  Two Two) (Nested Three\nThree))",
374            vec![Node::Regular(RegularNode {
375                opening_parenthesis_span: 1..2,
376                key: "Hello".into(),
377                key_span: 2..7,
378                data: Some("World ".into()),
379                data_span: 8..14,
380                children: vec![
381                    Node::Regular(RegularNode {
382                        opening_parenthesis_span: 14..15,
383                        key: "Nested".into(),
384                        key_span: 15..21,
385                        data: Some("One".into()),
386                        data_span: 22..25,
387                        children: vec![].into(),
388                        closing_parenthesis_span: 25..26,
389                    }),
390                    Node::Regular(RegularNode {
391                        opening_parenthesis_span: 27..28,
392                        key: "Nested".into(),
393                        key_span: 28..34,
394                        data: Some("Two Two".into()),
395                        data_span: 36..43,
396                        children: vec![].into(),
397                        closing_parenthesis_span: 43..44,
398                    }),
399                    Node::Regular(RegularNode {
400                        opening_parenthesis_span: 45..46,
401                        key: "Nested".into(),
402                        key_span: 46..52,
403                        data: Some("Three\nThree".into()),
404                        data_span: 53..64,
405                        children: vec![].into(),
406                        closing_parenthesis_span: 64..65,
407                    }),
408                ]
409                .into(),
410                closing_parenthesis_span: 65..66,
411            }),],
412            vec![],
413        ),
414        (
415            basic_never_ends,
416            "(Hello",
417            vec![Node::Regular(RegularNode {
418                opening_parenthesis_span: 0..1,
419                key: "Hello".into(),
420                key_span: 1..6,
421                data: Some("".into()),
422                data_span: 6..6,
423                children: vec![].into(),
424                closing_parenthesis_span: 6..6,
425            }),],
426            vec![ParseWarning {
427                span: 6..6,
428                knuth_pltotf_offset: Some(6),
429                kind: ParseWarningKind::UnbalancedOpeningParenthesis {
430                    opening_parenthesis_span: 0..1,
431                },
432            }],
433        ),
434        (
435            basic_never_ends_2,
436            "(",
437            vec![Node::Regular(RegularNode {
438                opening_parenthesis_span: 0..1,
439                key: "".into(),
440                key_span: 1..1,
441                data: Some("".into()),
442                data_span: 1..1,
443                children: vec![].into(),
444                closing_parenthesis_span: 1..1,
445            }),],
446            vec![ParseWarning {
447                span: 1..1,
448                knuth_pltotf_offset: Some(1),
449                kind: ParseWarningKind::UnbalancedOpeningParenthesis {
450                    opening_parenthesis_span: 0..1,
451                },
452            }],
453        ),
454        (
455            empty_node,
456            "()",
457            vec![Node::Regular(RegularNode {
458                opening_parenthesis_span: 0..1,
459                key: "".into(),
460                key_span: 1..1,
461                data: Some("".into()),
462                data_span: 1..1,
463                children: vec![].into(),
464                closing_parenthesis_span: 1..2,
465            }),],
466            vec![],
467        ),
468        (
469            crlf,
470            "(CRLF\r\nGOOD\r\r\nVALUE\r\n)",
471            vec![Node::Regular(RegularNode {
472                opening_parenthesis_span: 0..1,
473                key: "CRLF".into(),
474                key_span: 1..5,
475                data: Some("GOOD\nVALUE\n".into()),
476                data_span: 6..17,
477                children: vec![].into(),
478                closing_parenthesis_span: 17..18,
479            }),],
480            vec![],
481        ),
482        (
483            garbage_string_in_list,
484            "(Hello World) Garbage String (Hola Mundo)",
485            vec![
486                Node::Regular(RegularNode {
487                    opening_parenthesis_span: 0..1,
488                    key: "Hello".into(),
489                    key_span: 1..6,
490                    data: Some("World".into()),
491                    data_span: 7..12,
492                    children: vec![].into(),
493                    closing_parenthesis_span: 12..13,
494                }),
495                Node::Regular(RegularNode {
496                    opening_parenthesis_span: 29..30,
497                    key: "Hola".into(),
498                    key_span: 30..34,
499                    data: Some("Mundo".into()),
500                    data_span: 35..40,
501                    children: vec![].into(),
502                    closing_parenthesis_span: 40..41,
503                }),
504            ],
505            vec![ParseWarning {
506                span: 14..29,
507                knuth_pltotf_offset: Some(15),
508                kind: ParseWarningKind::JunkInsidePropertyList {
509                    junk: "Garbage String ".into(),
510                },
511            }],
512        ),
513        (
514            garbage_closed_parenthesis_in_list,
515            "(Hello World) ) (Hola Mundo)",
516            vec![
517                Node::Regular(RegularNode {
518                    opening_parenthesis_span: 0..1,
519                    key: "Hello".into(),
520                    key_span: 1..6,
521                    data: Some("World".into()),
522                    data_span: 7..12,
523                    children: vec![].into(),
524                    closing_parenthesis_span: 12..13,
525                }),
526                Node::Regular(RegularNode {
527                    opening_parenthesis_span: 16..17,
528                    key: "Hola".into(),
529                    key_span: 17..21,
530                    data: Some("Mundo".into()),
531                    data_span: 22..27,
532                    children: vec![].into(),
533                    closing_parenthesis_span: 27..28,
534                }),
535            ],
536            vec![ParseWarning {
537                span: 14..15,
538                knuth_pltotf_offset: Some(14),
539                kind: ParseWarningKind::UnexpectedClosingParenthesis,
540            }],
541        ),
542        (
543            comment,
544            "(COMMENT World (Nested One) Hello (Nested Two))",
545            vec![Node::Comment(
546                " World (Nested One) Hello (Nested Two)".into()
547            )],
548            vec![],
549        ),
550        (
551            comment_with_newlines,
552            "(COMMENT World (Nested\nOne) Hello (Nested\nTwo))",
553            vec![Node::Comment(
554                " World (Nested\nOne) Hello (Nested\nTwo)".into()
555            ),],
556            vec![],
557        ),
558        (
559            comment_trailing_space,
560            "(COMMENT World )",
561            vec![Node::Comment(" World ".into()),],
562            vec![],
563        ),
564        (
565            comment_never_ends,
566            "(COMMENT World (",
567            vec![Node::Comment(" World ()".into())],
568            vec![
569                ParseWarning {
570                    span: 16..16,
571                    knuth_pltotf_offset: Some(16),
572                    kind: ParseWarningKind::UnbalancedOpeningParenthesis {
573                        opening_parenthesis_span: 15..16,
574                    }
575                },
576                ParseWarning {
577                    span: 16..16,
578                    knuth_pltotf_offset: Some(16),
579                    kind: ParseWarningKind::UnbalancedOpeningParenthesis {
580                        opening_parenthesis_span: 0..1,
581                    }
582                },
583            ],
584        ),
585        (
586            non_visible_ascii_char,
587            "(Hello Worldä)",
588            vec![Node::Regular(RegularNode {
589                opening_parenthesis_span: 0..1,
590                key: "Hello".into(),
591                key_span: 1..6,
592                data: Some("Worldä".into()),
593                data_span: 7..13,
594                children: vec![].into(),
595                closing_parenthesis_span: 13..14,
596            }),],
597            vec![],
598        ),
599        (
600            trailing_space,
601            "(Hello World )",
602            vec![Node::Regular(RegularNode {
603                opening_parenthesis_span: 0..1,
604                key: "Hello".into(),
605                key_span: 1..6,
606                data: Some("World ".into()),
607                data_span: 7..13,
608                children: vec![].into(),
609                closing_parenthesis_span: 13..14,
610            }),],
611            vec![],
612        ),
613    );
614}