// locus_core_rs/parsing/lexer.rs
use super::lexicon::{
    CONTENT_MARKER, ENVELOPE_MARKER, LAYER_STOP_MARKER, METRICS_MARKER, PROVENANCE_MARKER,
};

/// Location of a token inside the tokenized input.
///
/// `start` and `end` are byte offsets into the input string, while `line` and `column`
/// describe where the token begins in human-readable terms.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the token's first byte.
    pub start: usize,
    /// Byte offset one past the token's last byte (`start` plus the token's byte length).
    pub end: usize,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column on which the token starts, counted in `char`s rather than bytes.
    pub column: usize,
}

/// The kinds of structural tokens the lexer emits.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// Emitted when the input matches `PROVENANCE_MARKER`.
    ProvenanceStart,
    /// Emitted when the input matches `ENVELOPE_MARKER`.
    EnvelopeStart,
    /// Emitted when the input matches `CONTENT_MARKER`.
    ContentStart,
    /// Emitted when the input matches `METRICS_MARKER`.
    MetricsStart,
    /// Emitted when the input matches `LAYER_STOP_MARKER`.
    LayerEnd,
    /// A literal `{`.
    LBrace,
    /// A literal `}`.
    RBrace,
}

24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub struct Token {
26 pub kind: TokenKind,
27 pub span: Span,
28}
29
30pub fn tokenize(input: &str) -> Vec<Token> {
31 let mut tokens = Vec::new();
32
33 let mut index = 0usize;
34 let mut line = 1usize;
35 let mut column = 1usize;
36
37 while index < input.len() {
38 let rest = &input[index..];
39
40 if let Some((kind, marker)) = match_structural_marker(rest) {
41 let len = marker.len();
42 tokens.push(Token {
43 kind,
44 span: Span {
45 start: index,
46 end: index + len,
47 line,
48 column,
49 },
50 });
51
52 advance_position(marker, &mut line, &mut column);
53 index += len;
54 continue;
55 }
56
57 let Some(ch) = rest.chars().next() else {
58 break;
59 };
60
61 let ch_len = ch.len_utf8();
62 match ch {
63 '{' => tokens.push(Token {
64 kind: TokenKind::LBrace,
65 span: Span {
66 start: index,
67 end: index + ch_len,
68 line,
69 column,
70 },
71 }),
72 '}' => tokens.push(Token {
73 kind: TokenKind::RBrace,
74 span: Span {
75 start: index,
76 end: index + ch_len,
77 line,
78 column,
79 },
80 }),
81 _ => {}
82 }
83
84 if ch == '\n' {
85 line += 1;
86 column = 1;
87 } else {
88 column += 1;
89 }
90 index += ch_len;
91 }
92
93 tokens
94}
95
96fn match_structural_marker(rest: &str) -> Option<(TokenKind, &'static str)> {
97 if rest.starts_with(PROVENANCE_MARKER) {
98 return Some((TokenKind::ProvenanceStart, PROVENANCE_MARKER));
99 }
100 if rest.starts_with(ENVELOPE_MARKER) {
101 return Some((TokenKind::EnvelopeStart, ENVELOPE_MARKER));
102 }
103 if rest.starts_with(CONTENT_MARKER) {
104 return Some((TokenKind::ContentStart, CONTENT_MARKER));
105 }
106 if rest.starts_with(METRICS_MARKER) {
107 return Some((TokenKind::MetricsStart, METRICS_MARKER));
108 }
109 if rest.starts_with(LAYER_STOP_MARKER) {
110 return Some((TokenKind::LayerEnd, LAYER_STOP_MARKER));
111 }
112
113 None
114}
115
/// Moves a line/column cursor past `text`.
///
/// Each `\n` increments `line` and resets `column` to 1; every other `char` increments
/// `column` by one. Columns therefore count `char`s, not bytes. An empty `text` leaves both
/// counters untouched.
fn advance_position(text: &str, line: &mut usize, column: &mut usize) {
    for ch in text.chars() {
        match ch {
            '\n' => {
                *line += 1;
                *column = 1;
            }
            _ => *column += 1,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// An input containing all four layers must yield every marker kind at least once.
    #[test]
    fn should_tokenize_structural_markers() {
        let raw = "⊕⟨ { a: 1 } ⟩\n⦿⟨ { b: 2 } ⟩\n◈⟨ { c(.9): x } ⟩\n⍉⟨ { rho: 1 } ⟩";
        let tokens = tokenize(raw);

        let expected_kinds = [
            TokenKind::ProvenanceStart,
            TokenKind::EnvelopeStart,
            TokenKind::ContentStart,
            TokenKind::MetricsStart,
            TokenKind::LayerEnd,
        ];
        for expected in expected_kinds {
            assert!(
                tokens.iter().any(|t| t.kind == expected),
                "expected at least one {expected:?} token"
            );
        }
    }

    /// A marker that follows a newline must be reported at the start of the next line.
    #[test]
    fn should_track_line_and_column() {
        let raw = "x\n⊕⟨ { a: 1 } ⟩";
        let tokens = tokenize(raw);
        let provenance = tokens
            .iter()
            .find(|t| t.kind == TokenKind::ProvenanceStart)
            .expect("provenance marker should exist");

        assert_eq!(provenance.span.line, 2);
        assert_eq!(provenance.span.column, 1);
    }
}
155}