texlang/token/
trace.rs

1//! Token tracing system
2//!
3//! This module implements a [Tracer] that determines the origins of tokens.
4//! When building helpful error messages we need to know the token came from -
5//!     e.g., the file and line number.
6//! The tracing functionality here enables obtaining this information in the form of a [SourceCodeTrace].
7//!
8//! Rather than the custom system here,
9//!     a simpler solution would be to include this information on the token itself.
10//! The token could include a line position number and a reference
11//!     counting pointer to some data structure containing information about the line.
12//! The problem with this solution is that it makes the [Token] type very large, and
13//!     this causes unacceptably poor performance in Texlang's tight inner loops.
14//! With the tracer here, each token only needs to hold onto a 32-bit [Key] which is enough
15//!     to perform a full trace.
16//!
17//! # How the tracer works
18//!
19//! When adding source code to the input, the tracer is informed using the
20//!     [register_source_code](Tracer::register_source_code) method.
21//! The tracer allocates a contiguous range of [Keys](Key) that is large enough
22//!     to give each UTF-8 character in the input a unique key.
23//! These keys are returned using the opaque [KeyRange] type, which enables the caller to retrieve
24//!     these keys.
25//! It is assumed that the caller will assign keys in order to each UTF-8 character in the source code.
26//!
27//! In addition to returning the range, the tracer associates the key range with the source code in an
28//!     internal data structure.
29//!
30//! When tracing a token (using [trace](Tracer::trace)), the token's key is used to identify
31//!     which key range the key came from.
32//! This key range is then used to identify the source code associated to the token.
33//! The difference between the token's key and the first key for the source code is the UTF-8 offset
34//!     into the source code.
35//! Thus we can uniquely identify the UTF-8 character the token is a associated to.
36use crate::token::{CsNameInterner, Token, Value};
37use std::collections::BTreeMap;
38use std::ops::Bound::Included;
39use std::path::PathBuf;
40
41use super::CommandRef;
42
43/// Key attached to tokens to enable tracing them.
44///
45/// This type is 32 bits.
46#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48pub struct Key(u32);
49
50impl Key {
51    #[cfg(test)]
52    pub(crate) fn dummy() -> Key {
53        Key(u32::MAX)
54    }
55
56    #[cfg(test)]
57    pub fn as_u32(&self) -> u32 {
58        self.0
59    }
60}
61
62/// Range of free keys that may be assigned to tokens.
63#[derive(Debug)]
64#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
65pub struct KeyRange {
66    next: u32,
67    limit: u32,
68}
69
70impl KeyRange {
71    /// Get the next trace [Key].
72    ///
73    /// Panics if all of the keys in this range have been used.
74    #[allow(clippy::should_implement_trait)]
75    pub fn next(&mut self) -> Key {
76        if self.next >= self.limit {
77            panic!["requested more trace keys than are in the range"]
78        }
79        let n = self.next;
80        self.next += 1;
81        Key(n)
82    }
83
84    /// Peek at the next trace [Key].
85    ///
86    /// Panics if all of the keys in this range have been used.
87    pub fn peek(&mut self) -> Key {
88        if self.next >= self.limit {
89            panic!["requested more trace keys than are in the range"]
90        }
91        Key(self.next)
92    }
93
94    /// Advances the range forward by the provided offset.
95    ///
96    /// Panics if the provided offset cannot be cast to u32.
97    pub fn advance_by(&mut self, u: usize) {
98        self.next = self.next.checked_add(u.try_into().unwrap()).unwrap();
99    }
100}
101
102impl KeyRange {
103    pub fn empty() -> KeyRange {
104        KeyRange { next: 0, limit: 0 }
105    }
106
107    #[cfg(test)]
108    pub fn for_testing() -> KeyRange {
109        KeyRange {
110            next: 0,
111            limit: u32::MAX,
112        }
113    }
114}
115
116/// A token trace
117#[derive(Debug, PartialEq, Eq, Clone)]
118#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
119pub struct SourceCodeTrace {
120    /// Origin of the source code this token came from.
121    pub origin: Origin,
122    /// Content of the line this token came from.
123    pub line_content: String,
124    /// Number of the line within the file, starting at 1.
125    pub line_number: usize,
126    /// Index within the line that the token starts.
127    pub index: usize,
128    /// Value of the token.
129    pub value: String,
130    /// If this is for a token, the value of the token.
131    /// Otherwise this is an end of input snippet.
132    pub token: Option<Token>,
133}
134
135/// Data structure that records information for token tracing
136#[derive(Default)]
137#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
138pub struct Tracer {
139    checkpoints: BTreeMap<u32, Checkpoint>,
140    next_key: u32,
141
142    // A key use to get the last file that was inputted manually; i.e., not via an \input
143    // or other command in a TeX file
144    last_external_input: Option<u32>,
145}
146
147/// Enum describing the possible origins of source code
148#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
149#[derive(Debug, PartialEq, Eq, Clone)]
150pub enum Origin {
151    File(PathBuf),
152    Terminal,
153}
154
155impl Tracer {
156    /// Registers source code with the tracer.
157    ///
158    /// The returned [KeyRange] should be used to assign [Keys](Key) to the tokens that
159    ///     are lexed from the referenced source code.
160    /// The tracer assumes that the first key is assigned to token corresponding to the
161    ///     first UTF-8 character in their source code,
162    ///     the second key to the second UTF-8 character, and so on.
163    pub fn register_source_code(
164        &mut self,
165        token: Option<Token>,
166        origin: Origin,
167        source_code: &str,
168    ) -> KeyRange {
169        let len = match u32::try_from(source_code.len()) {
170            Err(_) => {
171                panic!(
172                    "source code too big ({} bytes); max is 2^32={} bytes",
173                    source_code.len(),
174                    u32::MAX
175                )
176            }
177            // For empty files, we still assign one key because this allows us to trace end of input errors.
178            Ok(0) => 1_u32,
179            // We add 1 to handle the case when the file does not end in a newline character. In this case
180            // the extra key will be used to refer to a character added using the \endlinechar mechanism.
181            Ok(limit) => limit + 1,
182        };
183        let range = KeyRange {
184            next: self.next_key,
185            limit: self.next_key + len,
186        };
187        self.checkpoints.insert(
188            range.next,
189            Checkpoint::SourceCode {
190                origin,
191                content: source_code.to_string(),
192            },
193        );
194        if token.is_none() {
195            self.last_external_input = Some(self.next_key);
196        }
197        self.next_key = range.limit;
198        range
199    }
200
201    /// Return a trace for the provided token.
202    pub fn trace(&self, token: Token, cs_name_interner: &CsNameInterner) -> SourceCodeTrace {
203        let value = match token.value() {
204            Value::CommandRef(CommandRef::ControlSequence(cs_name)) => {
205                format!["\\{}", cs_name_interner.resolve(cs_name).unwrap()]
206            }
207            // TODO: maybe have a cs or char method?
208            _ => token.char().unwrap().to_string(),
209        };
210
211        let (&first_key, checkpoint) = self
212            .checkpoints
213            .range((Included(&0), Included(&token.trace_key.0)))
214            .next_back()
215            .unwrap();
216
217        match checkpoint {
218            Checkpoint::SourceCode { origin, content } => {
219                let char_offset = (token.trace_key().0 - first_key) as usize;
220                let mut line_number = 1;
221                let mut byte_line_start = 0;
222                let mut char_line_start = 0;
223                for (char_index, (byte_index, c)) in content.char_indices().enumerate() {
224                    if char_index == char_offset {
225                        break;
226                    }
227                    if c == '\n' {
228                        byte_line_start = byte_index + 1;
229                        char_line_start = char_index + 1;
230                        line_number += 1;
231                    }
232                }
233                let position = char_offset - char_line_start;
234                let tail = &content[byte_line_start..];
235                let line_content = match tail.split_once('\n') {
236                    None => tail.to_string(),
237                    Some((a, _)) => a.to_string(),
238                };
239                SourceCodeTrace {
240                    origin: origin.clone(),
241                    line_content,
242                    line_number,
243                    index: position,
244                    value,
245                    token: Some(token),
246                }
247            }
248        }
249    }
250
251    pub fn trace_end_of_input(&self) -> SourceCodeTrace {
252        let f = self
253            .checkpoints
254            .get(&self.last_external_input.unwrap())
255            .unwrap();
256        match f {
257            Checkpoint::SourceCode { origin, content } => {
258                // (line index, byte index of first character)
259                let mut last_line: (usize, usize) = (0, 0);
260                let mut last_non_empty_line: (usize, usize) = (0, 0);
261                for (i, c) in content.char_indices() {
262                    if !c.is_whitespace() {
263                        last_non_empty_line = last_line;
264                    } else if c == '\n' {
265                        last_line.0 += 1;
266                        last_line.1 = i + 1;
267                    }
268                }
269                let last_line = content[last_non_empty_line.1..].trim_end();
270                SourceCodeTrace {
271                    origin: origin.clone(),
272                    line_content: last_line.to_string(),
273                    line_number: last_non_empty_line.0 + 1,
274                    index: last_line.len(),
275                    value: " ".to_string(),
276                    token: None,
277                }
278            }
279        }
280    }
281}
282
283#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
284enum Checkpoint {
285    SourceCode {
286        origin: Origin,
287        content: String, // TODO: should be rc::Rc<str>?
288    },
289}
290
291#[cfg(test)]
292mod tests {
293    use super::*;
294
295    #[test]
296    fn one_source_code() {
297        let file_name: PathBuf = "input.tex".into();
298        let origin = Origin::File(file_name);
299        let line_1 = "hël".to_string();
300        let line_2 = "wor\\cömmand".to_string();
301        let line_3 = "hël".to_string();
302        let source_code = format!("{}\n{}\n{}", line_1, line_2, line_3);
303
304        let mut tracer: Tracer = Default::default();
305        let mut interner: CsNameInterner = Default::default();
306        let command = interner.get_or_intern("command");
307        let mut range = tracer.register_source_code(None, origin.clone(), &source_code);
308        let mut tokens = vec![
309            Token::new_letter('h', range.next()),
310            Token::new_letter('e', range.next()),
311            Token::new_letter('l', range.next()),
312            Token::new_space('\n', range.next()),
313            Token::new_letter('w', range.next()),
314            Token::new_letter('o', range.next()),
315            Token::new_letter('r', range.next()),
316            Token::new_control_sequence(command, range.next()),
317        ];
318        for _ in 0.."command".len() {
319            range.next();
320        }
321        let mut extra_tokens = vec![
322            Token::new_space('\n', range.next()),
323            Token::new_letter('h', range.next()),
324            Token::new_letter('e', range.next()),
325            Token::new_letter('l', range.next()),
326        ];
327        tokens.append(&mut extra_tokens);
328
329        let got_traces: Vec<SourceCodeTrace> = tokens
330            .iter()
331            .map(|token| tracer.trace(*token, &interner))
332            .collect();
333
334        let want_traces = vec![
335            SourceCodeTrace {
336                origin: origin.clone(),
337                line_content: line_1.clone(),
338                line_number: 1,
339                index: 0,
340                value: "h".to_string(),
341                token: Some(tokens[0]),
342            },
343            SourceCodeTrace {
344                origin: origin.clone(),
345                line_content: line_1.clone(),
346                line_number: 1,
347                index: 1,
348                value: "e".to_string(),
349                token: Some(tokens[1]),
350            },
351            SourceCodeTrace {
352                origin: origin.clone(),
353                line_content: line_1.clone(),
354                line_number: 1,
355                index: 2,
356                value: "l".to_string(),
357                token: Some(tokens[2]),
358            },
359            SourceCodeTrace {
360                origin: origin.clone(),
361                line_content: line_1.clone(),
362                line_number: 1,
363                index: 3,
364                value: "\n".to_string(),
365                token: Some(tokens[3]),
366            },
367            SourceCodeTrace {
368                origin: origin.clone(),
369                line_content: line_2.clone(),
370                line_number: 2,
371                index: 0,
372                value: "w".to_string(),
373                token: Some(tokens[4]),
374            },
375            SourceCodeTrace {
376                origin: origin.clone(),
377                line_content: line_2.clone(),
378                line_number: 2,
379                index: 1,
380                value: "o".to_string(),
381                token: Some(tokens[5]),
382            },
383            SourceCodeTrace {
384                origin: origin.clone(),
385                line_content: line_2.clone(),
386                line_number: 2,
387                index: 2,
388                value: "r".to_string(),
389                token: Some(tokens[6]),
390            },
391            SourceCodeTrace {
392                origin: origin.clone(),
393                line_content: line_2.clone(),
394                line_number: 2,
395                index: 3,
396                value: "\\command".to_string(),
397                token: Some(tokens[7]),
398            },
399            SourceCodeTrace {
400                origin: origin.clone(),
401                line_content: line_2.clone(),
402                line_number: 2,
403                index: 11,
404                value: "\n".to_string(),
405                token: Some(tokens[8]),
406            },
407            SourceCodeTrace {
408                origin: origin.clone(),
409                line_content: line_3.clone(),
410                line_number: 3,
411                index: 0,
412                value: "h".to_string(),
413                token: Some(tokens[9]),
414            },
415            SourceCodeTrace {
416                origin: origin.clone(),
417                line_content: line_3.clone(),
418                line_number: 3,
419                index: 1,
420                value: "e".to_string(),
421                token: Some(tokens[10]),
422            },
423            SourceCodeTrace {
424                origin: origin.clone(),
425                line_content: line_3.clone(),
426                line_number: 3,
427                index: 2,
428                value: "l".to_string(),
429                token: Some(tokens[11]),
430            },
431        ];
432        assert_eq!(want_traces, got_traces);
433    }
434
435    #[test]
436    fn multiple_source_code() {
437        let mut tokens = Vec::new();
438        let mut tracer: Tracer = Default::default();
439        let interner: CsNameInterner = Default::default();
440
441        let file_1 = Origin::File("a.tex".into());
442        let file_1_content = "a".to_string();
443        let mut range = tracer.register_source_code(None, file_1.clone(), &file_1_content);
444        tokens.push(Token::new_letter('a', range.next()));
445
446        let file_2 = Origin::File("b.tex".into());
447        let file_2_content = "b".to_string();
448        let mut range = tracer.register_source_code(None, file_2.clone(), &file_2_content);
449        tokens.push(Token::new_letter('b', range.next()));
450
451        let file_3 = Origin::Terminal;
452        let file_3_content = "c".to_string();
453        let mut range = tracer.register_source_code(None, file_3.clone(), &file_3_content);
454        tokens.push(Token::new_letter('c', range.next()));
455
456        let got_traces: Vec<SourceCodeTrace> = tokens
457            .iter()
458            .map(|token| tracer.trace(*token, &interner))
459            .collect();
460
461        let want_traces = vec![
462            SourceCodeTrace {
463                origin: file_1,
464                line_content: file_1_content,
465                line_number: 1,
466                index: 0,
467                value: "a".to_string(),
468                token: Some(tokens[0]),
469            },
470            SourceCodeTrace {
471                origin: file_2,
472                line_content: file_2_content,
473                line_number: 1,
474                index: 0,
475                value: "b".to_string(),
476                token: Some(tokens[1]),
477            },
478            SourceCodeTrace {
479                origin: file_3,
480                line_content: file_3_content,
481                line_number: 1,
482                index: 0,
483                value: "c".to_string(),
484                token: Some(tokens[2]),
485            },
486        ];
487        assert_eq!(want_traces, got_traces);
488    }
489}