1use crate::error;
93use crate::token;
94use crate::token::trace;
95use crate::token::CsNameInterner;
96use crate::token::Token;
97use crate::types::CatCode;
98use crate::vm;
99
100#[derive(Debug)]
102pub struct InvalidCharacterError {
103 pub char: char,
105 pub trace: trace::SourceCodeTrace,
107}
108
109impl InvalidCharacterError {
110 pub fn new<S: vm::TexlangState>(vm: &vm::VM<S>, char: char, trace_key: trace::Key) -> Self {
112 Self {
113 char,
114 trace: vm.trace(Token::new_letter(char, trace_key)),
115 }
116 }
117}
118
119impl error::TexError for InvalidCharacterError {
120 fn kind(&self) -> error::Kind {
121 error::Kind::FailedPrecondition
122 }
125
126 fn title(&self) -> String {
127 format![
128 "input contains a character {} (Unicode code point {}) with category code {}",
129 self.char,
130 self.char as u32,
131 CatCode::Invalid
132 ]
133 }
134
135 fn source_annotation(&self) -> String {
136 "invalid character".into()
137 }
138
139 fn notes(&self) -> Vec<error::display::Note> {
140 vec![format![
141 "characters with category code {} cannot appear in the input",
142 CatCode::Invalid
143 ]
144 .into()]
145 }
146 fn source_code_trace_override(&self) -> Option<&trace::SourceCodeTrace> {
147 Some(&self.trace)
148 }
149}
150
151pub trait Config {
156 fn cat_code(&self, c: char) -> CatCode;
158
159 fn end_line_char(&self) -> Option<char>;
161}
162
163pub enum Result {
165 Token(token::Token),
167 InvalidCharacter(char, trace::Key),
169 EndOfLine,
173 EndOfInput,
175}
176
/// Where the lexer is within the current line; decides how spaces and line ends
/// are treated.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug)]
enum State {
    /// At the start of a line: spaces are skipped and an end-of-line character
    /// produces a `par` control sequence.
    NewLine,
    /// In the middle of a line: a space or an end-of-line character produces a
    /// space token.
    MidLine,
    /// After a space token, or after a control sequence whose name starts with a
    /// letter or a space: spaces and end-of-line characters are skipped.
    SkipBlanks,
}
184
185#[derive(Debug)]
187#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
188pub struct Lexer {
189 raw_lexer: RawLexer,
190 state: State,
191 first_line_started: bool,
192 #[cfg_attr(feature = "serde", serde(skip))]
194 buffer: String,
195}
196
197impl Lexer {
198 pub fn new(source_code: String, trace_key_range: trace::KeyRange) -> Lexer {
200 Lexer {
201 raw_lexer: RawLexer::new(source_code, trace_key_range),
202 state: State::NewLine,
203 first_line_started: false,
204 buffer: Default::default(),
205 }
206 }
207
208 pub fn next<C: Config>(
210 &mut self,
211 config: &C,
212 cs_name_interner: &mut CsNameInterner,
213 report_end_of_line: bool,
214 ) -> Result {
215 loop {
216 let raw_token = match self.raw_lexer.next(config) {
217 None => {
218 self.state = State::NewLine;
219 if !self.raw_lexer.start_new_line(config) {
220 return Result::EndOfInput;
221 }
222 if report_end_of_line {
223 if self.first_line_started {
224 return Result::EndOfLine;
225 }
226 self.first_line_started = true;
227 }
228 continue;
229 }
230 Some(raw_token) => raw_token,
231 };
232 let c = raw_token.char;
233 let (value, next_state) = match raw_token.code {
234 CatCode::Escape => {
235 let (cs_name, new_state) = self.read_control_sequence(config, cs_name_interner);
236 (
237 Token::new_control_sequence(cs_name, raw_token.trace_key),
238 new_state,
239 )
240 }
241 CatCode::EndOfLine => {
242 self.raw_lexer.end_line();
243 match self.state {
244 State::NewLine => (
245 Token::new_control_sequence(
246 cs_name_interner.get_or_intern("par"),
247 raw_token.trace_key,
248 ),
249 State::NewLine,
250 ),
251 State::MidLine => {
252 (Token::new_space(' ', raw_token.trace_key), State::NewLine)
253 }
254 State::SkipBlanks => {
255 continue;
256 }
257 }
258 }
259 CatCode::Space => match self.state {
260 State::NewLine | State::SkipBlanks => continue,
261 State::MidLine => (
262 Token::new_space(' ', raw_token.trace_key),
263 State::SkipBlanks,
264 ),
265 },
266 CatCode::BeginGroup => (
267 Token::new_begin_group(c, raw_token.trace_key),
268 State::MidLine,
269 ),
270 CatCode::EndGroup => (Token::new_end_group(c, raw_token.trace_key), State::MidLine),
271 CatCode::MathShift => (
272 Token::new_math_shift(c, raw_token.trace_key),
273 State::MidLine,
274 ),
275 CatCode::AlignmentTab => (
276 Token::new_alignment_tab(c, raw_token.trace_key),
277 State::MidLine,
278 ),
279 CatCode::Parameter => {
280 (Token::new_parameter(c, raw_token.trace_key), State::MidLine)
281 }
282 CatCode::Superscript => {
283 if self
284 .raw_lexer
285 .maybe_apply_caret_notation(raw_token.char, true)
286 {
287 continue;
288 }
289 (
290 Token::new_superscript(c, raw_token.trace_key),
291 State::MidLine,
292 )
293 }
294 CatCode::Subscript => {
295 (Token::new_subscript(c, raw_token.trace_key), State::MidLine)
296 }
297 CatCode::Letter => (Token::new_letter(c, raw_token.trace_key), State::MidLine),
298 CatCode::Other => (Token::new_other(c, raw_token.trace_key), State::MidLine),
299 CatCode::Active => (
300 Token::new_active_character(c, raw_token.trace_key),
301 State::MidLine,
302 ),
303 CatCode::Comment => {
304 self.raw_lexer.end_line();
305 continue;
306 }
307 CatCode::Ignored => {
308 continue;
309 }
310 CatCode::Invalid => {
311 return Result::InvalidCharacter(raw_token.char, raw_token.trace_key)
312 }
313 };
314 self.state = next_state;
315 return Result::Token(value);
316 }
317 }
318
319 pub fn end(&mut self) {
323 self.raw_lexer.end()
324 }
325
326 fn read_control_sequence<F: Config>(
327 &mut self,
328 config: &F,
329 cs_name_interner: &mut CsNameInterner,
330 ) -> (token::CsName, State) {
331 self.buffer.clear();
332 let first_raw_token = match self.raw_lexer.next(config) {
333 None => return (cs_name_interner.get_or_intern(""), State::NewLine),
334 Some(first_raw_token) => first_raw_token,
335 };
336 match first_raw_token.code {
337 CatCode::Letter => {
338 self.buffer.push(first_raw_token.char);
339 while let Some(raw_token) = self.raw_lexer.peek(config) {
340 match raw_token.code {
341 CatCode::Letter => {
342 self.raw_lexer.advance();
343 self.buffer.push(raw_token.char);
344 }
345 CatCode::Superscript => {
346 if self
347 .raw_lexer
348 .maybe_apply_caret_notation(raw_token.char, false)
349 {
350 continue;
351 }
352 break;
353 }
354 _ => break,
355 }
356 }
357 }
358 CatCode::Superscript => {
359 if self
360 .raw_lexer
361 .maybe_apply_caret_notation(first_raw_token.char, true)
362 {
363 return self.read_control_sequence(config, cs_name_interner);
364 }
365 self.buffer.push(first_raw_token.char);
366 }
367 _ => {
368 self.buffer.push(first_raw_token.char);
369 }
370 };
371 let new_state = match first_raw_token.code {
372 CatCode::Letter | CatCode::Space => State::SkipBlanks,
373 _ => State::MidLine,
374 };
375 (cs_name_interner.get_or_intern(&self.buffer), new_state)
376 }
377}
378
379#[derive(Debug)]
380struct RawToken {
381 code: CatCode,
382 char: char,
383 trace_key: trace::Key,
384}
385
386#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
387#[derive(Debug)]
388struct RawLexer {
389 source_code: String,
391 current_line: String,
392 pos: usize,
393
394 next_line: usize,
395 num_trimmed_right: usize,
399 trace_key_range: trace::KeyRange,
400}
401
402impl RawLexer {
403 fn new(source_code: String, trace_key_range: trace::KeyRange) -> RawLexer {
404 RawLexer {
405 source_code,
406 current_line: "".into(),
407 pos: 0,
408 next_line: 0,
409 num_trimmed_right: 0,
410 trace_key_range,
411 }
412 }
413
414 fn end_line(&mut self) {
415 let num_skipped_chars: usize = self.current_line[self.pos..].chars().count();
416 self.trace_key_range.advance_by(num_skipped_chars);
417 self.pos = self.current_line.len();
418 }
419
420 fn start_new_line<C: Config>(&mut self, config: &C) -> bool {
422 let num_skipped_chars: usize = self.current_line[self.pos..].chars().count();
423 self.trace_key_range.advance_by(num_skipped_chars);
424 self.trace_key_range.advance_by(self.num_trimmed_right);
425 self.pos = 0;
426 self.current_line.clear();
427 if self.next_line >= self.source_code.len() {
428 return false;
429 }
430 let start = self.next_line;
431 let mut end = self.next_line;
432 let mut num_spaces = 0;
433 for c in self.source_code[self.next_line..].chars() {
434 if c == '\n' {
435 num_spaces += 1;
441 break;
442 }
443 if c != ' ' {
444 end += c.len_utf8();
445 end += num_spaces;
446 num_spaces = 0;
447 }
448 if c == ' ' {
449 num_spaces += 1;
450 }
451 }
452 self.num_trimmed_right = num_spaces;
453 self.next_line = end + num_spaces;
454 self.current_line.push_str(&self.source_code[start..end]);
455 if let Some(end_line_char) = config.end_line_char() {
456 self.num_trimmed_right = self.num_trimmed_right.saturating_sub(1);
459 self.current_line.push(end_line_char)
460 }
461 true
462 }
463
464 #[inline]
465 fn next<C: Config>(&mut self, config: &C) -> Option<RawToken> {
466 match self.next_char() {
467 Some(c) => {
468 self.pos += c.len_utf8();
469 Some(RawToken {
470 char: c,
471 code: config.cat_code(c),
472 trace_key: self.trace_key_range.next(),
473 })
474 }
475 None => None,
476 }
477 }
478
479 fn peek<C: Config>(&mut self, config: &C) -> Option<RawToken> {
480 match self.next_char() {
481 Some(c) => {
482 let code = config.cat_code(c);
483 Some(RawToken {
484 char: c,
485 code,
486 trace_key: self.trace_key_range.peek(),
487 })
488 }
489 None => None,
490 }
491 }
492
493 fn maybe_apply_caret_notation(&mut self, char_1: char, char_1_consumed: bool) -> bool {
494 let char_2_start = if char_1_consumed {
495 self.pos
496 } else {
497 self.pos + char_1.len_utf8()
498 };
499 let char_2 = match self.current_line[char_2_start..].chars().next() {
500 None => return false,
503 Some(next_char) => next_char,
504 };
505 if char_2 != char_1 {
506 return false;
507 }
508 let char_3_start = char_2_start + char_2.len_utf8();
509 let char_3 = match self.current_line[char_3_start..].chars().next() {
510 None => return false,
513 Some(c) => c,
514 };
515 if !char_1_consumed {
516 self.advance();
517 }
518 self.advance();
519 if !char_3.is_ascii() {
520 return true;
521 }
522 let u: u8 = match (char_3 as u32).try_into() {
523 Ok(u) => u,
524 Err(_) => return true, };
526 let m = match u {
527 0x00..=0x3F => u + 0x40,
528 0x40..=0x7F => u - 0x40,
529 _ => return true, };
531 assert!(char::from_u32(m as u32).unwrap().is_ascii());
534 unsafe {
538 self.current_line.as_bytes_mut()[self.pos] = m;
539 }
540 true
541 }
542
543 fn next_char(&self) -> Option<char> {
544 self.current_line[self.pos..].chars().next()
545 }
546
547 fn advance(&mut self) {
548 self.pos += self.current_line[self.pos..]
549 .chars()
550 .next()
551 .unwrap()
552 .len_utf8();
553 self.trace_key_range.next();
554 }
555
556 fn end(&mut self) {
557 self.pos = self.current_line.len();
559 self.next_line = self.source_code.len();
562 }
563}
564
565#[cfg(test)]
566mod tests {
567 use super::*;
568 use crate::token::CommandRef;
569 use crate::token::Value;
570 use crate::types::CatCode::*;
571 use std::collections::HashMap;
572
573 #[derive(Debug, PartialEq, Eq)]
574 enum TokenValue<'a> {
575 Character(char, CatCode, u32),
576 ControlSequence(&'a str, u32),
577 NewLine,
578 }
579 use TokenValue::*;
580
581 impl<'a> TokenValue<'a> {
582 fn new(token: Token, interner: &'a CsNameInterner) -> TokenValue<'a> {
583 let trace_key = token.trace_key().as_u32();
584 if let Value::CommandRef(CommandRef::ControlSequence(cs_name)) = token.value() {
585 TokenValue::ControlSequence(interner.resolve(cs_name).unwrap(), trace_key)
586 } else {
587 TokenValue::Character(token.char().unwrap(), token.cat_code().unwrap(), trace_key)
588 }
589 }
590 }
591
592 struct TestConfig {
593 cat_codes: HashMap<char, CatCode>,
594 end_line_char: Option<char>,
595 }
596
597 impl Config for TestConfig {
598 fn cat_code(&self, c: char) -> CatCode {
599 self.cat_codes.get(&c).copied().unwrap_or_default()
600 }
601 fn end_line_char(&self) -> Option<char> {
602 self.end_line_char
603 }
604 }
605
606 fn lexer_test(
607 input: &str,
608 expected_tokens: Vec<TokenValue>,
609 end_line_char: Option<char>,
610 cat_code_overrides: Vec<(char, CatCode)>,
611 ) {
612 let mut cat_codes: HashMap<char, CatCode> = CatCode::PLAIN_TEX_DEFAULTS
613 .iter()
614 .enumerate()
615 .map(|(a, b)| (char::from_u32(a.try_into().unwrap()).unwrap(), *b))
616 .collect();
617 for (c, cat_code) in cat_code_overrides {
618 cat_codes.insert(c, cat_code);
619 }
620 let config = TestConfig {
621 cat_codes,
622 end_line_char,
623 };
624 let mut cs_name_interner: CsNameInterner = Default::default();
625 let mut actual = Vec::new();
626 let mut lexer = Lexer::new(input.into(), trace::KeyRange::for_testing());
627 loop {
628 let next = lexer.next(&config, &mut cs_name_interner, true);
629 if let Result::EndOfInput = next {
630 break;
631 }
632 actual.push(next);
633 }
634 let actual: Vec<TokenValue<'_>> = actual
635 .into_iter()
636 .map(|t| match t {
637 Result::Token(t) => TokenValue::new(t, &cs_name_interner),
638 Result::InvalidCharacter(_, _) => panic!("invalid character"),
639 Result::EndOfLine => TokenValue::NewLine,
640 Result::EndOfInput => unreachable!(),
641 })
642 .collect();
643 assert_eq!(expected_tokens, actual);
644 }
645
646 macro_rules! lexer_tests {
647 (
648 end_line_char( $end_line_char: expr ),
649 cat_code_overrides $cat_code_overrides: tt,
650 $( ( $name: ident, $input: expr, $ ( $expected_token : expr, ) * ), )+
651 ) => {
652 $(
653 #[test]
654 fn $name() {
655 let end_line_char = $end_line_char;
656 let cat_code_overrides = vec! $cat_code_overrides;
657 let input = $input;
658 let expected_tokens = vec!( $( $expected_token ),* );
659 lexer_test(&input, expected_tokens, end_line_char, cat_code_overrides);
660 }
661 )+
662 };
663 }
664
665 lexer_tests![
666 end_line_char(Some('\r')),
667 cat_code_overrides(),
668 (empty_1, r"",),
669 (empty_2, "\n", ControlSequence("par", 0),),
670 (
671 control_sequence_basic_1,
672 r"\a{b}",
673 ControlSequence("a", 0),
674 Character('{', BeginGroup, 2),
675 Character('b', Letter, 3),
676 Character('}', EndGroup, 4),
677 Character(' ', Space, 5),
678 ),
679 (
680 control_sequence_basic_2,
681 r"\A1",
682 ControlSequence("A", 0),
683 Character('1', Other, 2),
684 Character(' ', Space, 3),
685 ),
686 (
687 control_sequence_single_letter_trailing_space_1,
688 r"\a b",
689 ControlSequence("a", 0),
690 Character('b', Letter, 3),
691 Character(' ', Space, 4),
692 ),
693 (
694 control_sequence_single_letter_trailing_space_2,
695 r"\a b",
696 ControlSequence("a", 0),
697 Character('b', Letter, 4),
698 Character(' ', Space, 5),
699 ),
700 (
701 control_sequence_single_letter_trailing_newline_1,
702 "\\a\n b",
703 ControlSequence("a", 0),
704 NewLine,
705 Character('b', Letter, 4),
706 Character(' ', Space, 5),
707 ),
708 (
709 control_sequence_single_letter_trailing_newline_2,
710 "\\a\n\nb",
711 ControlSequence("a", 0),
712 NewLine,
713 ControlSequence("par", 3),
714 NewLine,
715 Character('b', Letter, 4),
716 Character(' ', Space, 5),
717 ),
718 (
719 control_sequence_multi_letter_1,
720 "\\ABC{D}",
721 ControlSequence("ABC", 0),
722 Character('{', BeginGroup, 4),
723 Character('D', Letter, 5),
724 Character('}', EndGroup, 6),
725 Character(' ', Space, 7),
726 ),
727 (
728 control_sequence_multi_letter_2,
729 "\\ABC",
730 ControlSequence("ABC", 0),
731 ),
732 (
733 control_sequence_single_other_1,
734 "\\{{",
735 ControlSequence("{", 0),
736 Character('{', BeginGroup, 2),
737 Character(' ', Space, 3),
738 ),
739 (
740 control_sequence_single_other_2,
741 "\\+A",
742 ControlSequence("+", 0),
743 Character('A', Letter, 2),
744 Character(' ', Space, 3),
745 ),
746 (
747 control_sequence_single_other_trailing_space,
748 "\\+ A",
749 ControlSequence("+", 0),
750 Character(' ', Space, 2),
751 Character('A', Letter, 3),
752 Character(' ', Space, 4),
753 ),
754 (
755 control_sequence_single_space_trailing_space,
756 "\\ A",
757 ControlSequence(" ", 0),
758 Character('A', Letter, 3),
759 Character(' ', Space, 4),
760 ),
761 (
762 comment_1,
763 "A%B\nC",
764 Character('A', Letter, 0),
765 NewLine,
766 Character('C', Letter, 4),
767 Character(' ', Space, 5),
768 ),
769 (
770 comment_1_with_space,
771 "A%B \nC",
772 Character('A', Letter, 0),
773 NewLine,
774 Character('C', Letter, 5),
775 Character(' ', Space, 6),
776 ),
777 (
778 comment_2,
779 "A%B\n%C\nD",
780 Character('A', Letter, 0),
781 NewLine,
782 NewLine,
783 Character('D', Letter, 7),
784 Character(' ', Space, 8),
785 ),
786 (comment_3, "A%a comment here", Character('A', Letter, 0),),
787 (
788 comment_4,
789 "A%\n B",
790 Character('A', Letter, 0),
791 NewLine,
792 Character('B', Letter, 4),
793 Character(' ', Space, 5),
794 ),
795 (
796 comment_5,
797 "A%\n\n B",
798 Character('A', Letter, 0),
799 NewLine,
800 ControlSequence("par", 3),
801 NewLine,
802 Character('B', Letter, 5),
803 Character(' ', Space, 6),
804 ),
805 (
806 comment_6,
807 "\\A %\nB",
808 ControlSequence("A", 0),
809 NewLine,
810 Character('B', Letter, 5),
811 Character(' ', Space, 6),
812 ),
813 (
814 texbook_exercise_8_2_e,
815 "A%\n B%",
816 Character('A', Letter, 0),
817 NewLine,
818 Character('B', Letter, 4),
819 ),
820 (
821 texbook_exercise_8_4,
822 r" $x^2$~ \Tex ^^C",
823 Character('$', MathShift, 1),
824 Character('x', Letter, 2),
825 Character('^', Superscript, 3),
826 Character('2', Other, 4),
827 Character('$', MathShift, 5),
828 Character('~', Active, 6),
829 Character(' ', Space, 7),
830 ControlSequence("Tex", 8),
831 Character('\u{3}', Other, 15),
832 Character(' ', Space, 16),
833 ),
834 (
835 texbook_exercise_8_5,
836 "Hi!\n\n\n",
837 Character('H', Letter, 0),
838 Character('i', Letter, 1),
839 Character('!', Other, 2),
840 Character(' ', Space, 3),
841 NewLine,
842 ControlSequence("par", 4),
843 NewLine,
844 ControlSequence("par", 5),
845 ),
846 (
847 double_space_creates_one_space,
848 "A B",
849 Character('A', Letter, 0),
850 Character(' ', Space, 1),
851 Character('B', Letter, 3),
852 Character(' ', Space, 4),
853 ),
854 (
855 single_newline_creates_one_space,
856 "A\nB",
857 Character('A', Letter, 0),
858 Character(' ', Space, 1),
859 NewLine,
860 Character('B', Letter, 2),
861 Character(' ', Space, 3),
862 ),
863 (
864 space_and_newline_creates_space,
865 "A \nB",
866 Character('A', Letter, 0),
867 Character(' ', Space, 1),
868 NewLine,
869 Character('B', Letter, 3),
870 Character(' ', Space, 4),
871 ),
872 (
873 par_1,
874 "A\n\nB",
875 Character('A', Letter, 0),
876 Character(' ', Space, 1),
877 NewLine,
878 ControlSequence("par", 2),
879 NewLine,
880 Character('B', Letter, 3),
881 Character(' ', Space, 4),
882 ),
883 (
884 par_2,
885 "A\n \nB",
886 Character('A', Letter, 0),
887 Character(' ', Space, 1),
888 NewLine,
889 ControlSequence("par", 2),
890 NewLine,
891 Character('B', Letter, 4),
892 Character(' ', Space, 5),
893 ),
894 (
895 par_3,
896 "A\n\n\nB",
897 Character('A', Letter, 0),
898 Character(' ', Space, 1),
899 NewLine,
900 ControlSequence("par", 2),
901 NewLine,
902 ControlSequence("par", 3),
903 NewLine,
904 Character('B', Letter, 4),
905 Character(' ', Space, 5),
906 ),
907 (
908 caret_notation_1,
909 "^^k",
910 Character('+', Other, 2),
911 Character(' ', Space, 3),
912 ),
913 (
914 caret_notation_2,
915 "^^+",
916 Character('k', Letter, 2),
917 Character(' ', Space, 3),
918 ),
919 (
920 caret_notation_3,
921 "^^+m",
922 Character('k', Letter, 2),
923 Character('m', Letter, 3),
924 Character(' ', Space, 4),
925 ),
926 (caret_notation_4, "^^\n", Character('M', Letter, 2),),
927 (caret_notation_5, "^^", Character('M', Letter, 2),),
928 (
929 caret_notation_6,
930 "^^\nA",
931 Character('M', Letter, 2),
932 NewLine,
933 Character('A', Letter, 3),
934 Character(' ', Space, 4),
935 ),
936 (
937 caret_notation_recursive_1,
938 "^^\u{1E}^+",
939 Character('k', Letter, 4),
940 Character(' ', Space, 5),
941 ),
942 (
943 caret_notation_recursive_2,
944 "\\^^\u{1E}^+",
945 ControlSequence("k", 0),
946 ),
947 (
948 caret_notation_recursive_3,
949 "\\j^^\u{1E}^+",
950 ControlSequence("jk", 0),
951 ),
952 (
953 caret_notation_recursive_4,
955 format!["\\^^{}+", "\u{1E}^".repeat(200)],
956 ControlSequence("k", 0),
957 ),
958 (
959 caret_notation_end_of_input_1,
960 "^^",
961 Character('M', Letter, 2),
962 ),
963 (
964 caret_notation_end_of_input_2,
965 "\\^^",
966 ControlSequence("M", 0),
967 ),
968 (
969 caret_notation_end_of_input_3,
970 "\\a^^",
971 ControlSequence("aM", 0),
972 ),
973 (
974 caret_notation_boundary_1,
975 "^^\u{00}",
976 Character(char::from_u32(0x40).unwrap(), Other, 2),
977 Character(' ', Space, 3),
978 ),
979 (
980 caret_notation_boundary_3,
981 "^^\u{40}",
982 ControlSequence("par", 3),
984 ),
985 (
986 caret_notation_boundary_4,
987 "^^\u{7F}",
988 Character(char::from_u32(0x3F).unwrap(), Other, 2),
989 Character(' ', Space, 3),
990 ),
991 (
992 caret_notation_cs_1,
993 r"\^^m",
994 ControlSequence("-", 0),
995 Character(' ', Space, 4),
996 ),
997 (
998 caret_notation_cs_2,
999 r"\^^ma",
1000 ControlSequence("-", 0),
1001 Character('a', Letter, 4),
1002 Character(' ', Space, 5),
1003 ),
1004 (caret_notation_cs_3, r"\^^-", ControlSequence("m", 0),),
1005 (caret_notation_cs_4, r"\^^-a", ControlSequence("ma", 0),),
1006 (
1007 caret_notation_cs_5,
1008 r"\^^-^^-+",
1009 ControlSequence("mm", 0),
1010 Character('+', Other, 7),
1011 Character(' ', Space, 8),
1012 ),
1013 (caret_notation_cs_6, r"\a^^-", ControlSequence("am", 0),),
1014 (
1015 caret_notation_cs_7,
1016 "\\^a",
1017 ControlSequence("^", 0),
1018 Character('a', Letter, 2),
1019 Character(' ', Space, 3),
1020 ),
1021 (
1022 caret_notation_cs_8,
1023 "\\a^a",
1024 ControlSequence("a", 0),
1025 Character('^', Superscript, 2),
1026 Character('a', Letter, 3),
1027 Character(' ', Space, 4),
1028 ),
1029 ];
1030
1031 lexer_tests![
1032 end_line_char(Some('\r')),
1033 cat_code_overrides(('Z', Ignored)),
1034 (
1035 control_sequence_single_ignored,
1036 r"\Z",
1037 ControlSequence("Z", 0),
1038 Character(' ', Space, 2),
1039 ),
1040 (ignored_character_1, "Z", ControlSequence("par", 1),),
1041 (
1042 ignored_character_2,
1043 "AZB",
1044 Character('A', Letter, 0),
1045 Character('B', Letter, 2),
1046 Character(' ', Space, 3),
1047 ),
1048 (
1049 texbook_exercise_8_2_f,
1050 r"\AZB",
1051 ControlSequence("A", 0),
1052 Character('B', Letter, 3),
1053 Character(' ', Space, 4),
1054 ),
1055 ];
1056
1057 lexer_tests![
1058 end_line_char(Some('\r')),
1059 cat_code_overrides(('W', Invalid)),
1060 (
1061 control_sequence_single_invalid,
1062 r"\W",
1063 ControlSequence("W", 0),
1064 Character(' ', Space, 2),
1065 ),
1066 ];
1067
1068 lexer_tests![
1069 end_line_char(Some('\r')),
1070 cat_code_overrides(('X', EndOfLine)),
1071 (
1072 non_standard_newline_character,
1073 "AXB",
1074 Character('A', Letter, 0),
1075 Character(' ', Space, 1),
1076 ),
1077 (
1078 non_standard_newline_character_2,
1079 "AXXB",
1080 Character('A', Letter, 0),
1081 Character(' ', Space, 1),
1082 ),
1083 (
1084 non_standard_newline_character_after_cs,
1085 r"\A XB",
1086 ControlSequence("A", 0),
1087 ),
1088 (single_non_standard_newline, "X", ControlSequence("par", 0),),
1089 ];
1090
1091 lexer_tests![
1092 end_line_char(Some('\r')),
1093 cat_code_overrides(('Y', Space)),
1094 (
1095 non_standard_whitespace_1,
1096 "AYB",
1097 Character('A', Letter, 0),
1098 Character(' ', Space, 1),
1099 Character('B', Letter, 2),
1100 Character(' ', Space, 3),
1101 ),
1102 ];
1103
1104 lexer_tests![
1105 end_line_char(Some('\r')),
1106 cat_code_overrides(
1107 ('\u{01}', Escape),
1108 ('\u{02}', Superscript),
1109 ('\u{03}', Space),
1110 ('\u{0D}', Letter),
1111 ),
1112 (
1113 texbook_exercise_8_6,
1114 r"^^B^^BM^^A^^B^^C^^M^^@\M ",
1115 Character('\u{02}', Superscript, 2),
1116 Character('\u{02}', Superscript, 5),
1117 Character('M', Letter, 6),
1118 ControlSequence("\u{02}", 9),
1119 Character(' ', Space, 15),
1120 Character('\u{0D}', Letter, 18),
1121 ControlSequence("M\u{0D}", 22),
1122 ),
1123 ];
1124
1125 lexer_tests![
1126 end_line_char(Some('B')),
1127 cat_code_overrides(),
1128 (
1129 control_sequence_includes_end_line_char_1,
1130 r"\A",
1131 ControlSequence("AB", 0),
1132 ),
1133 (
1134 control_sequence_includes_end_line_char_2,
1135 r"\A ",
1136 ControlSequence("AB", 0),
1137 ),
1138 (
1139 control_sequence_includes_end_line_char_3,
1140 r"\",
1141 ControlSequence("B", 0),
1142 ),
1143 (
1144 control_sequence_includes_end_line_char_4,
1145 r"\ ",
1146 ControlSequence("B", 0),
1147 ),
1148 (
1149 control_sequence_does_not_span_lines,
1150 "\\A\nC",
1151 ControlSequence("AB", 0),
1152 NewLine,
1153 Character('C', Letter, 3),
1154 Character('B', Letter, 4),
1155 ),
1156 (
1157 repeated_end_line_char_1,
1158 "\n\n\n",
1159 Character('B', Letter, 0),
1160 NewLine,
1161 Character('B', Letter, 1),
1162 NewLine,
1163 Character('B', Letter, 2),
1164 ),
1165 (
1166 repeated_end_line_char_2,
1167 "A\nA\nA\n",
1168 Character('A', Letter, 0),
1169 Character('B', Letter, 1),
1170 NewLine,
1171 Character('A', Letter, 2),
1172 Character('B', Letter, 3),
1173 NewLine,
1174 Character('A', Letter, 4),
1175 Character('B', Letter, 5),
1176 ),
1177 (
1178 right_side_trimming,
1179 "A \nA \n",
1180 Character('A', Letter, 0),
1181 Character('B', Letter, 1),
1182 NewLine,
1183 Character('A', Letter, 4),
1184 Character('B', Letter, 5),
1185 ),
1186 (
1187 left_side_trimming,
1188 "A\n A\n",
1189 Character('A', Letter, 0),
1190 Character('B', Letter, 1),
1191 NewLine,
1192 Character('A', Letter, 3),
1193 Character('B', Letter, 4),
1194 ),
1195 ];
1196
1197 lexer_tests![
1198 end_line_char(None),
1199 cat_code_overrides(),
1200 (
1201 multiple_skipped_lines,
1202 "A\n\n\nB",
1203 Character('A', Letter, 0),
1204 NewLine,
1205 NewLine,
1206 NewLine,
1207 Character('B', Letter, 4),
1208 ),
1209 (
1210 empty_cs_name,
1211 "\\\nB",
1212 ControlSequence("", 0),
1213 NewLine,
1214 Character('B', Letter, 2),
1215 ),
1216 ];
1217}