// locus_core_rs/parsing/lexer.rs

use super::lexicon::{
    CONTENT_MARKER, ENVELOPE_MARKER, LAYER_STOP_MARKER, METRICS_MARKER, PROVENANCE_MARKER,
};

/// Location of a token within the text passed to `tokenize`.
///
/// `start` and `end` are byte offsets into the input, while `line` and
/// `column` are 1-based and describe the position of the token's first
/// character. `column` advances once per character, not once per byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the token's first byte.
    pub start: usize,
    /// Byte offset one past the token's last byte (exclusive).
    pub end: usize,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column (counted in characters) at which the token starts.
    pub column: usize,
}

/// The kinds of structural token the lexer emits.
///
/// The first five variants correspond to the marker constants imported from
/// `super::lexicon`; the last two correspond to literal brace characters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// The input matched `PROVENANCE_MARKER`.
    ProvenanceStart,
    /// The input matched `ENVELOPE_MARKER`.
    EnvelopeStart,
    /// The input matched `CONTENT_MARKER`.
    ContentStart,
    /// The input matched `METRICS_MARKER`.
    MetricsStart,
    /// The input matched `LAYER_STOP_MARKER`.
    LayerEnd,
    /// A literal `{`.
    LBrace,
    /// A literal `}`.
    RBrace,
}

24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub struct Token {
26    pub kind: TokenKind,
27    pub span: Span,
28}
29
30pub fn tokenize(input: &str) -> Vec<Token> {
31    let mut tokens = Vec::new();
32
33    let mut index = 0usize;
34    let mut line = 1usize;
35    let mut column = 1usize;
36
37    while index < input.len() {
38        let rest = &input[index..];
39
40        if let Some((kind, marker)) = match_structural_marker(rest) {
41            let len = marker.len();
42            tokens.push(Token {
43                kind,
44                span: Span {
45                    start: index,
46                    end: index + len,
47                    line,
48                    column,
49                },
50            });
51
52            advance_position(marker, &mut line, &mut column);
53            index += len;
54            continue;
55        }
56
57        let Some(ch) = rest.chars().next() else {
58            break;
59        };
60
61        let ch_len = ch.len_utf8();
62        match ch {
63            '{' => tokens.push(Token {
64                kind: TokenKind::LBrace,
65                span: Span {
66                    start: index,
67                    end: index + ch_len,
68                    line,
69                    column,
70                },
71            }),
72            '}' => tokens.push(Token {
73                kind: TokenKind::RBrace,
74                span: Span {
75                    start: index,
76                    end: index + ch_len,
77                    line,
78                    column,
79                },
80            }),
81            _ => {}
82        }
83
84        if ch == '\n' {
85            line += 1;
86            column = 1;
87        } else {
88            column += 1;
89        }
90        index += ch_len;
91    }
92
93    tokens
94}
95
96fn match_structural_marker(rest: &str) -> Option<(TokenKind, &'static str)> {
97    if rest.starts_with(PROVENANCE_MARKER) {
98        return Some((TokenKind::ProvenanceStart, PROVENANCE_MARKER));
99    }
100    if rest.starts_with(ENVELOPE_MARKER) {
101        return Some((TokenKind::EnvelopeStart, ENVELOPE_MARKER));
102    }
103    if rest.starts_with(CONTENT_MARKER) {
104        return Some((TokenKind::ContentStart, CONTENT_MARKER));
105    }
106    if rest.starts_with(METRICS_MARKER) {
107        return Some((TokenKind::MetricsStart, METRICS_MARKER));
108    }
109    if rest.starts_with(LAYER_STOP_MARKER) {
110        return Some((TokenKind::LayerEnd, LAYER_STOP_MARKER));
111    }
112
113    None
114}
115
/// Moves a 1-based `line`/`column` cursor past every character of `text`.
///
/// A `'\n'` starts a new line and resets `column` to 1; any other character
/// advances `column` by one. Columns therefore count characters, not bytes.
fn advance_position(text: &str, line: &mut usize, column: &mut usize) {
    for ch in text.chars() {
        if ch == '\n' {
            *line += 1;
            *column = 1;
        } else {
            *column += 1;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A document containing all four layer openers and their closers must
    /// yield at least one token of every marker kind.
    #[test]
    fn should_tokenize_structural_markers() {
        let raw = "⊕⟨ { a: 1 } ⟩\n⦿⟨ { b: 2 } ⟩\n◈⟨ { c(.9): x } ⟩\n⍉⟨ { rho: 1 } ⟩";
        let tokens = tokenize(raw);

        for expected in [
            TokenKind::ProvenanceStart,
            TokenKind::EnvelopeStart,
            TokenKind::ContentStart,
            TokenKind::MetricsStart,
            TokenKind::LayerEnd,
        ] {
            assert!(
                tokens.iter().any(|t| t.kind == expected),
                "missing {expected:?} token"
            );
        }
    }

    /// A marker at the start of the second line reports line 2, column 1.
    #[test]
    fn should_track_line_and_column() {
        let raw = "x\n⊕⟨ { a: 1 } ⟩";
        let tokens = tokenize(raw);
        let provenance = tokens
            .iter()
            .find(|t| t.kind == TokenKind::ProvenanceStart)
            .expect("provenance marker should exist");

        assert_eq!(provenance.span.line, 2);
        assert_eq!(provenance.span.column, 1);
    }
}