diff --git a/Cargo.toml b/Cargo.toml index 824fca6..67e691e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,11 @@ [workspace] -members = ["utils", "utils-aio"] +members = ["utils", "utils-aio", "spansy"] [workspace.dependencies] +tlsn-utils = { path = "utils" } +tlsn-utils-aio = { path = "utils-aio" } +spansy = { path = "spansy" } + rand = "0.8" thiserror = "1" async-trait = "0.1" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..1b5ec8b --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..91590a0 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 sinu <65924192+sinui0@users.noreply.github.com> + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/spansy/.gitignore b/spansy/.gitignore new file mode 100644 index 0000000..c2548ae --- /dev/null +++ b/spansy/.gitignore @@ -0,0 +1,4 @@ +/target +.cargo-ok +.DS_Store +Cargo.lock \ No newline at end of file diff --git a/spansy/CHANGELOG.md b/spansy/CHANGELOG.md new file mode 100644 index 0000000..ef88a6b --- /dev/null +++ b/spansy/CHANGELOG.md @@ -0,0 +1,7 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] diff --git a/spansy/Cargo.toml b/spansy/Cargo.toml new file mode 100644 index 0000000..f774a09 --- /dev/null +++ b/spansy/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "spansy" +version = "0.1.0" +edition = "2021" +description = "Parsing with span information" +repository = "https://github.com/tlsnotary/tlsn-utils" +license = "MIT OR Apache-2.0" + +[features] +default = [] +serde = ["dep:serde", "bytes/serde"] + +[dependencies] +tlsn-utils.workspace = true + +bytes.workspace = true +serde = { workspace = true, features = ["derive"], optional = true } +thiserror.workspace = true + +httparse = "1.8" +pest = { version = "2.7" } +pest_derive = { version = "2.7" } diff --git a/spansy/README.md b/spansy/README.md new file mode 100644 index 0000000..290d0fb --- /dev/null +++ b/spansy/README.md @@ -0,0 +1,3 @@ +# spansy + +Crate for parsing span information from common formats such as HTTP and JSON. diff --git a/spansy/src/helpers.rs b/spansy/src/helpers.rs new file mode 100644 index 0000000..7e01fe6 --- /dev/null +++ b/spansy/src/helpers.rs @@ -0,0 +1,51 @@ +use std::ops::Range; + +/// Returns the range within the source string corresponding to the span. +/// +/// # Panics +/// +/// Panics if the span is not within the source string. 
+pub(crate) fn get_span_range(src: &[u8], span: &[u8]) -> Range { + let src_start = src.as_ptr() as usize; + let src_end = src_start + src.len(); + let span_start = span.as_ptr() as usize; + let span_end = span_start + span.len(); + + assert!( + span_start >= src_start && span_end <= src_end, + "span is not within source string: src={src_start}..{src_end}, span={span_start}..{span_end}" + ); + + span_start - src_start..span_end - src_start +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_span_range() { + let src = b"foobar"; + + assert_eq!(get_span_range(src, &src[..]), 0..src.len()); + assert_eq!(get_span_range(src, &src[0..1]), 0..1); + assert_eq!(get_span_range(src, &src[1..2]), 1..2); + assert_eq!(get_span_range(src, &src[3..6]), 3..6); + } + + #[test] + #[should_panic] + fn test_get_span_range_outside_src_begin() { + let src = b"foobar"; + + get_span_range(&src[1..3], &src[..3]); + } + + #[test] + #[should_panic] + fn test_get_span_range_outside_src_end() { + let src = b"foobar"; + + get_span_range(&src[1..3], &src[2..]); + } +} diff --git a/spansy/src/http/mod.rs b/spansy/src/http/mod.rs new file mode 100644 index 0000000..9080a20 --- /dev/null +++ b/spansy/src/http/mod.rs @@ -0,0 +1,251 @@ +//! HTTP span parsing. + +mod span; +mod types; + +use bytes::Bytes; + +pub use span::{parse_request, parse_response}; +pub use types::{ + Body, Code, Header, HeaderName, HeaderValue, Method, Reason, Request, RequestLine, Response, + Status, Target, +}; + +use crate::ParseError; + +use self::span::{parse_request_from_bytes, parse_response_from_bytes}; +/// An iterator yielding parsed HTTP requests. +#[derive(Debug)] +pub struct Requests { + src: Bytes, + /// The current position in the source string. + pos: usize, +} + +impl Requests { + /// Returns a new `Requests` iterator. + pub fn new(src: Bytes) -> Self { + Self { src, pos: 0 } + } + + /// Returns a new `Requests` iterator. 
+ pub fn new_from_slice(src: &[u8]) -> Self { + Self { + src: Bytes::copy_from_slice(src), + pos: 0, + } + } +} + +impl Iterator for Requests { + type Item = Result; + + fn next(&mut self) -> Option { + if self.pos >= self.src.len() { + None + } else { + Some(parse_request_from_bytes(&self.src, self.pos).map(|req| { + self.pos += req.span.len(); + req + })) + } + } +} + +/// An iterator yielding parsed HTTP responses. +#[derive(Debug)] +pub struct Responses { + src: Bytes, + /// The current position in the source string. + pos: usize, +} + +impl Responses { + /// Returns a new `Responses` iterator. + pub fn new(src: Bytes) -> Self { + Self { src, pos: 0 } + } + + /// Returns a new `Responses` iterator. + pub fn new_from_slice(src: &[u8]) -> Self { + Self { + src: Bytes::copy_from_slice(src), + pos: 0, + } + } +} + +impl Iterator for Responses { + type Item = Result; + + fn next(&mut self) -> Option { + if self.pos >= self.src.len() { + None + } else { + Some(parse_response_from_bytes(&self.src, self.pos).map(|resp| { + self.pos += resp.span.len(); + resp + })) + } + } +} + +#[cfg(test)] +mod tests { + use crate::Spanned; + + use super::*; + + const MULTIPLE_REQUESTS: &[u8] = b"GET / HTTP/1.1\r\nHost: localhost\r\n\r\n\ + POST /hello HTTP/1.1\r\nHost: localhost\r\nContent-Length: 14\r\n\r\n\ + Hello, world!\n"; + + const MULTIPLE_RESPONSES: &[u8] = b"HTTP/1.1 200 OK\r\nContent-Length: 0\r\n\r\n\ + HTTP/1.1 200 OK\r\nContent-Length: 14\r\n\r\nHello, world!\n\ + HTTP/1.1 204 OK\r\nContent-Length: 0\r\n\r\n"; + + #[test] + fn test_parse_requests() { + let reqs = Requests::new_from_slice(MULTIPLE_REQUESTS) + .collect::, _>>() + .unwrap(); + + assert_eq!(reqs.len(), 2); + + assert_eq!(reqs[0].request.method.as_str(), "GET"); + assert!(reqs[0].body.is_none()); + assert_eq!( + reqs[0] + .headers_with_name("host") + .next() + .unwrap() + .value + .as_bytes(), + b"localhost" + ); + + assert_eq!(reqs[1].request.method.as_str(), "POST"); + assert_eq!( + reqs[1] + 
.headers_with_name("host") + .next() + .unwrap() + .value + .as_bytes(), + b"localhost" + ); + assert_eq!( + reqs[1] + .headers_with_name("content-length") + .next() + .unwrap() + .value + .as_bytes(), + b"14" + ); + assert_eq!( + reqs[1].body.as_ref().unwrap().span(), + b"Hello, world!\n".as_slice() + ); + } + + #[test] + fn test_parse_responses() { + let resps = Responses::new_from_slice(MULTIPLE_RESPONSES) + .collect::, _>>() + .unwrap(); + + assert_eq!(resps.len(), 3); + + assert_eq!(resps[0].status.code.as_str(), "200"); + assert_eq!( + resps[0] + .headers_with_name("content-length") + .next() + .unwrap() + .value + .as_bytes(), + b"0" + ); + assert!(resps[0].body.is_none()); + + assert_eq!(resps[1].status.code.as_str(), "200"); + assert_eq!( + resps[1] + .headers_with_name("content-length") + .next() + .unwrap() + .value + .as_bytes(), + b"14" + ); + assert_eq!( + resps[1].body.as_ref().unwrap().span(), + b"Hello, world!\n".as_slice() + ); + + assert_eq!(resps[2].status.code.as_str(), "204"); + assert_eq!( + resps[2] + .headers_with_name("content-length") + .next() + .unwrap() + .value + .as_bytes(), + b"0" + ); + assert!(resps[2].body.is_none()); + } + + #[test] + fn test_parse_request_duplicate_headers() { + let req_bytes = b"GET / HTTP/1.1\r\nHost: localhost\r\nAccept: application/json\r\n\ + Accept: application/xml\r\n\r\n"; + let reqs = Requests::new_from_slice(req_bytes) + .collect::, _>>() + .unwrap(); + + assert_eq!(reqs.len(), 1); + let req = reqs.first().unwrap(); + + let headers: Vec<_> = req.headers_with_name("host").collect(); + assert_eq!(headers.len(), 1); + assert_eq!(headers.first().unwrap().value.as_bytes(), b"localhost"); + + let headers: Vec<_> = req.headers_with_name("accept").collect(); + assert_eq!(headers.len(), 2); + assert_eq!( + headers + .iter() + .map(|h| h.value.as_bytes()) + .collect::>(), + vec!["application/json".as_bytes(), "application/xml".as_bytes()], + ); + } + + #[test] + fn test_parse_response_duplicate_headers() { + 
let resp_bytes = b"HTTP/1.1 200 OK\r\nSet-Cookie: lang=en; Path=/\r\n\ + Set-Cookie: fang=fen; Path=/\r\nContent-Length: 14\r\n\r\n{\"foo\": \"bar\"}"; + let resps = Responses::new_from_slice(resp_bytes) + .collect::, _>>() + .unwrap(); + + assert_eq!(resps.len(), 1); + let resp = resps.first().unwrap(); + + let headers: Vec<_> = resp.headers_with_name("set-cookie").collect(); + assert_eq!(headers.len(), 2); + assert_eq!( + headers + .iter() + .map(|h| h.value.as_bytes()) + .collect::>(), + vec!["lang=en; Path=/".as_bytes(), "fang=fen; Path=/".as_bytes()], + ); + + let headers: Vec<_> = resp.headers_with_name("content-length").collect(); + assert_eq!(headers.len(), 1); + assert_eq!(headers.first().unwrap().value.as_bytes(), b"14"); + } +} diff --git a/spansy/src/http/span.rs b/spansy/src/http/span.rs new file mode 100644 index 0000000..9bf7ef7 --- /dev/null +++ b/spansy/src/http/span.rs @@ -0,0 +1,435 @@ +use bytes::Bytes; + +use crate::{ + helpers::get_span_range, + http::{ + Body, Code, Header, HeaderName, HeaderValue, Method, Reason, Request, RequestLine, + Response, Status, Target, + }, + ParseError, Span, +}; + +const MAX_HEADERS: usize = 128; + +/// Parses an HTTP request. +pub fn parse_request(src: &[u8]) -> Result { + parse_request_from_bytes(&Bytes::copy_from_slice(src), 0) +} + +/// Parses an HTTP request from a `Bytes` buffer starting from the `offset`. 
+pub(crate) fn parse_request_from_bytes(src: &Bytes, offset: usize) -> Result { + let mut headers = [httparse::EMPTY_HEADER; MAX_HEADERS]; + + let (method, path, head_end) = { + let mut request = httparse::Request::new(&mut headers); + + let head_end = match request.parse(&src[offset..]) { + Ok(httparse::Status::Complete(head_end)) => head_end + offset, + Ok(httparse::Status::Partial) => { + return Err(ParseError(format!("incomplete request: {:?}", src))) + } + Err(err) => return Err(ParseError(err.to_string())), + }; + + let method = request + .method + .ok_or_else(|| ParseError("method missing from request".to_string()))?; + + let path = request + .path + .ok_or_else(|| ParseError("path missing from request".to_string()))?; + + (method, path, head_end) + }; + + let request_line_end = src[offset..] + .windows(2) + .position(|w| w == b"\r\n") + .expect("request line is terminated with CRLF"); + let request_line_range = offset..offset + request_line_end + 2; + + let headers = headers + .iter() + .take_while(|h| *h != &httparse::EMPTY_HEADER) + .map(|header| from_header(src, header)) + .collect(); + + // httparse allocates a new buffer to store the method for performance reasons, + // so we have to search for the span in the source. This is quick as the method + // is at the front. + let method = src[offset..] 
+ .windows(method.len()) + .find(|w| *w == method.as_bytes()) + .expect("method is present"); + + let mut request = Request { + span: Span::new_bytes(src.clone(), offset..head_end), + request: RequestLine { + span: Span::new_str(src.clone(), request_line_range), + method: Method(Span::new_str(src.clone(), get_span_range(src, method))), + target: Target(Span::new_from_str(src.clone(), path)), + }, + headers, + body: None, + }; + + let body_len = request_body_len(&request)?; + + if body_len > 0 { + let range = head_end..head_end + body_len; + + if range.end > src.len() { + return Err(ParseError(format!( + "body range {}..{} exceeds source {}", + range.start, + range.end, + src.len() + ))); + } + + request.span = Span::new_bytes(src.clone(), offset..range.end); + + request.body = Some(Body { + span: Span::new_bytes(src.clone(), range), + }); + } + + Ok(request) +} + +/// Parses an HTTP response. +pub fn parse_response(src: &[u8]) -> Result { + parse_response_from_bytes(&Bytes::copy_from_slice(src), 0) +} + +/// Parses an HTTP response from a `Bytes` buffer starting from the `offset`. +pub(crate) fn parse_response_from_bytes( + src: &Bytes, + offset: usize, +) -> Result { + let mut headers = [httparse::EMPTY_HEADER; MAX_HEADERS]; + + let (reason, code, head_end) = { + let mut response = httparse::Response::new(&mut headers); + + let head_end = match response.parse(&src[offset..]) { + Ok(httparse::Status::Complete(head_end)) => head_end + offset, + Ok(httparse::Status::Partial) => { + return Err(ParseError(format!("incomplete response: {:?}", src))) + } + Err(err) => return Err(ParseError(err.to_string())), + }; + + let code = response + .code + .ok_or_else(|| ParseError("code missing from response".to_string())) + .map(|c| c.to_string())?; + + let reason = response + .reason + .ok_or_else(|| ParseError("reason missing from response".to_string()))?; + + (reason, code, head_end) + }; + + let status_line_end = src[offset..] 
+ .windows(2) + .position(|w| w == b"\r\n") + .expect("status line is terminated with CRLF"); + let status_line_range = offset..offset + status_line_end + 2; + + let headers = headers + .iter() + .take_while(|h| *h != &httparse::EMPTY_HEADER) + .map(|header| from_header(src, header)) + .collect(); + + // httparse doesn't preserve the response code span, so we find it. + let code = src[offset..] + .windows(3) + .find(|w| *w == code.as_bytes()) + .expect("code is present"); + + let mut response = Response { + span: Span::new_bytes(src.clone(), offset..head_end), + status: Status { + span: Span::new_str(src.clone(), status_line_range), + code: Code(Span::new_str(src.clone(), get_span_range(src, code))), + reason: Reason(Span::new_from_str(src.clone(), reason)), + }, + headers, + body: None, + }; + + let body_len = response_body_len(&response)?; + + if body_len > 0 { + let range = head_end..head_end + body_len; + + if range.end > src.len() { + return Err(ParseError(format!( + "body range {}..{} exceeds source {}", + range.start, + range.end, + src.len() + ))); + } + + response.span = Span::new_bytes(src.clone(), offset..range.end); + + response.body = Some(Body { + span: Span::new_bytes(src.clone(), range), + }); + } + + Ok(response) +} + +/// Converts a `httparse::Header` to a `Header`. +fn from_header(src: &Bytes, header: &httparse::Header) -> Header { + let name_range = get_span_range(src, header.name.as_bytes()); + let value_range = get_span_range(src, header.value); + + let crlf_idx = src[value_range.end..] + .windows(2) + .position(|b| b == b"\r\n") + .expect("CRLF is present in a valid header"); + + // Capture the entire header including trailing whitespace and the CRLF. 
+ let header_range = name_range.start..value_range.end + crlf_idx + 2; + + Header { + span: Span::new_bytes(src.clone(), header_range), + name: HeaderName(Span::new_str(src.clone(), name_range)), + value: HeaderValue(Span::new_bytes(src.clone(), value_range)), + } +} + +/// Calculates the length of the request body according to RFC 9112, section 6. +fn request_body_len(request: &Request) -> Result { + // The presence of a message body in a request is signaled by a Content-Length + // or Transfer-Encoding header field. + + // If a message is received with both a Transfer-Encoding and a Content-Length header field, + // the Transfer-Encoding overrides the Content-Length + if request + .headers_with_name("Transfer-Encoding") + .next() + .is_some() + { + Err(ParseError( + "Transfer-Encoding not supported yet".to_string(), + )) + } else if let Some(h) = request.headers_with_name("Content-Length").next() { + // If a valid Content-Length header field is present without Transfer-Encoding, its decimal value + // defines the expected message body length in octets. + std::str::from_utf8(h.value.0.as_bytes())? + .parse::() + .map_err(|err| ParseError(format!("failed to parse Content-Length value: {err}"))) + } else { + // If this is a request message and none of the above are true, then the message body length is zero + Ok(0) + } +} + +/// Calculates the length of the response body according to RFC 9112, section 6. +fn response_body_len(response: &Response) -> Result { + // Any response to a HEAD request and any response with a 1xx (Informational), 204 (No Content), or 304 (Not Modified) + // status code is always terminated by the first empty line after the header fields, regardless of the header fields + // present in the message, and thus cannot contain a message body or trailer section. 
+ match response + .status + .code + .as_str() + .parse::() + .expect("code is valid utf-8") + { + 100..=199 | 204 | 304 => return Ok(0), + _ => {} + } + + if response + .headers_with_name("Transfer-Encoding") + .next() + .is_some() + { + Err(ParseError( + "Transfer-Encoding not supported yet".to_string(), + )) + } else if let Some(h) = response.headers_with_name("Content-Length").next() { + // If a valid Content-Length header field is present without Transfer-Encoding, its decimal value + // defines the expected message body length in octets. + std::str::from_utf8(h.value.0.as_bytes())? + .parse::() + .map_err(|err| ParseError(format!("failed to parse Content-Length value: {err}"))) + } else { + // If this is a response message and none of the above are true, then there is no way to + // determine the length of the message body except by reading it until the connection is closed. + + // We currently consider this an error because we have no outer context information. + Err(ParseError( + "A response with a body must contain either a Content-Length or Transfer-Encoding header".to_string(), + )) + } +} + +#[cfg(test)] +mod tests { + use crate::Spanned; + + use super::*; + + const TEST_REQUEST: &[u8] = b"\ + GET /home.html HTTP/1.1\r\n\ + Host: developer.mozilla.org\r\n\ + User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0\r\n\ + Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.\r\n\ + Accept-Language: en-US,en;q=0.\r\n\ + Accept-Encoding: gzip, deflate, b\r\n\ + Referer: https://developer.mozilla.org/testpage.htm\r\n\ + Connection: keep-alive\r\n\ + Content-Length: 12\r\n\ + Cache-Control: max-age=0\r\n\r\n\ + Hello World!"; + + const TEST_RESPONSE: &[u8] = b"\ + HTTP/1.1 200 OK\r\n\ + Date: Mon, 27 Jul 2009 12:28:53 GMT\r\n\ + Server: Apache/2.2.14 (Win32)\r\n\ + Last-Modified: Wed, 22 Jul 2009 19:15:56 GMT\r\n\ + Content-Length: 52\r\n\ + Content-Type: text/html\r\n\ + Connection: Closed\r\n\r\n\ + \n\ 
+ \n\ +

Hello, World!

\n\ + \n\ + "; + + const TEST_REQUEST2: &[u8] = b"\ + GET /info.html HTTP/1.1\r\n\ + Host: tlsnotary.org\r\n\ + User-Agent: client\r\n\ + Content-Length: 4\r\n\r\n\ + ping"; + + const TEST_RESPONSE2: &[u8] = b"\ + HTTP/1.1 200 OK\r\n\ + Server: server\r\n\ + Content-Length: 4\r\n\ + Content-Type: text/plain\r\n\ + Connection: keep-alive\r\n\r\n\ + pong"; + + #[test] + fn test_parse_request() { + let req = parse_request(TEST_REQUEST).unwrap(); + + assert_eq!(req.span(), TEST_REQUEST); + assert_eq!(req.request.method.as_str(), "GET"); + assert_eq!( + req.headers_with_name("Host").next().unwrap().value.span(), + b"developer.mozilla.org".as_slice() + ); + assert_eq!( + req.headers_with_name("User-Agent") + .next() + .unwrap() + .value + .span(), + b"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0" + .as_slice() + ); + assert_eq!(req.body.unwrap().span(), b"Hello World!".as_slice()); + } + + #[test] + fn test_parse_header_trailing_whitespace() { + let req = parse_request(b"GET / HTTP/1.1\r\nHost: example.com \r\n\r\n").unwrap(); + let header = req.headers_with_name("Host").next().unwrap(); + + assert_eq!(header.span.as_bytes(), b"Host: example.com \r\n".as_slice()); + } + + #[test] + fn test_parse_response() { + let res = parse_response(TEST_RESPONSE).unwrap(); + + assert_eq!(res.span(), TEST_RESPONSE); + assert_eq!(res.status.code.as_str(), "200"); + assert_eq!(res.status.reason.as_str(), "OK"); + assert_eq!( + res.headers_with_name("Server").next().unwrap().value.span(), + b"Apache/2.2.14 (Win32)".as_slice() + ); + assert_eq!( + res.headers_with_name("Connection") + .next() + .unwrap() + .value + .span(), + b"Closed".as_slice() + ); + assert_eq!( + res.body.unwrap().span(), + b"\n\n

Hello, World!

\n\n".as_slice() + ); + } + + // Make sure the first request is not parsed. + #[test] + fn test_parse_request_from_bytes() { + let mut request = Vec::new(); + request.extend(TEST_REQUEST2); + request.extend(TEST_REQUEST); + let request = Bytes::copy_from_slice(&request); + let req = parse_request_from_bytes(&request, TEST_REQUEST2.len()).unwrap(); + + assert_eq!(req.span(), TEST_REQUEST); + assert_eq!(req.request.method.as_str(), "GET"); + assert_eq!( + req.headers_with_name("Host").next().unwrap().value.span(), + b"developer.mozilla.org".as_slice() + ); + assert_eq!( + req.headers_with_name("User-Agent") + .next() + .unwrap() + .value + .span(), + b"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0" + .as_slice() + ); + assert_eq!(req.body.unwrap().span(), b"Hello World!".as_slice()); + } + + // Make sure the first response is not parsed. + #[test] + fn test_parse_response_from_bytes() { + let mut response = Vec::new(); + response.extend(TEST_RESPONSE2); + response.extend(TEST_RESPONSE); + let response = Bytes::copy_from_slice(&response); + let res = parse_response_from_bytes(&response, TEST_RESPONSE2.len()).unwrap(); + + assert_eq!(res.span(), TEST_RESPONSE); + assert_eq!(res.status.code.as_str(), "200"); + assert_eq!(res.status.reason.as_str(), "OK"); + assert_eq!( + res.headers_with_name("Server").next().unwrap().value.span(), + b"Apache/2.2.14 (Win32)".as_slice() + ); + assert_eq!( + res.headers_with_name("Connection") + .next() + .unwrap() + .value + .span(), + b"Closed".as_slice() + ); + assert_eq!( + res.body.unwrap().span(), + b"\n\n

Hello, World!

\n\n".as_slice() + ); + } +} diff --git a/spansy/src/http/types.rs b/spansy/src/http/types.rs new file mode 100644 index 0000000..4b3ddcb --- /dev/null +++ b/spansy/src/http/types.rs @@ -0,0 +1,368 @@ +use utils::range::{RangeDifference, RangeSet}; + +use crate::{Span, Spanned}; + +/// An HTTP header name. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct HeaderName(pub(crate) Span); + +impl HeaderName { + /// Returns the header name as a string slice. + pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.0.offset(offset); + } +} + +impl Spanned for HeaderName { + fn span(&self) -> &Span { + &self.0 + } +} + +/// An HTTP header value. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct HeaderValue(pub(crate) Span); + +impl HeaderValue { + /// Returns the header value as a byte slice. + pub fn as_bytes(&self) -> &[u8] { + self.0.as_bytes() + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.0.offset(offset); + } +} + +impl Spanned for HeaderValue { + fn span(&self) -> &Span { + &self.0 + } +} + +/// An HTTP header, including optional whitespace and the trailing CRLF. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Header { + pub(crate) span: Span, + /// The header name. + pub name: HeaderName, + /// The header value. + pub value: HeaderValue, +} + +impl Header { + /// Returns the indices of the header excluding the value. + /// + /// The indices will include any optional whitespace and the CRLF. + pub fn without_value(&self) -> RangeSet { + self.span.indices.difference(&self.value.span().indices) + } + + /// Shifts the span range by the given offset. 
+ pub fn offset(&mut self, offset: usize) { + self.span.offset(offset); + self.name.offset(offset); + self.value.offset(offset); + } +} + +impl Spanned for Header { + fn span(&self) -> &Span { + &self.span + } +} + +/// An HTTP request method. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Method(pub(crate) Span); + +impl Method { + /// Returns the method as a string slice. + pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.0.offset(offset); + } +} + +impl Spanned for Method { + fn span(&self) -> &Span { + &self.0 + } +} + +/// An HTTP request target. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Target(pub(crate) Span); + +impl Target { + /// Returns the target as a string slice. + pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.0.offset(offset); + } +} + +impl Spanned for Target { + fn span(&self) -> &Span { + &self.0 + } +} + +/// An HTTP request line, including the trailing CRLF. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct RequestLine { + pub(crate) span: Span, + + /// The request method. + pub method: Method, + /// The request target. + pub target: Target, +} + +impl RequestLine { + /// Returns the indices of the request line excluding the request target. + pub fn without_target(&self) -> RangeSet { + self.span.indices.difference(&self.target.0.indices) + } + + /// Shifts the span range by the given offset. 
+ pub fn offset(&mut self, offset: usize) { + self.span.offset(offset); + self.method.offset(offset); + self.target.offset(offset); + } +} + +impl Spanned for RequestLine { + fn span(&self) -> &Span { + &self.span + } +} + +/// An HTTP request. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Request { + pub(crate) span: Span, + /// The request line. + pub request: RequestLine, + /// Request headers. + pub headers: Vec
, + /// Request body. + pub body: Option, +} + +impl Request { + /// Returns an iterator of request headers with the given name (case-insensitive). + /// + /// This method returns an iterator because it is valid for HTTP records to contain + /// duplicate header names. + pub fn headers_with_name<'a>(&'a self, name: &'a str) -> impl Iterator { + self.headers + .iter() + .filter(|h| h.name.0.as_str().eq_ignore_ascii_case(name)) + } + + /// Returns the indices of the request excluding the target, headers and body. + pub fn without_data(&self) -> RangeSet { + let mut indices = self.span.indices.difference(&self.request.target.0.indices); + for header in &self.headers { + indices = indices.difference(header.span.indices()); + } + if let Some(body) = &self.body { + indices = indices.difference(body.span.indices()); + } + indices + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.span.offset(offset); + self.request.offset(offset); + for header in &mut self.headers { + header.offset(offset); + } + if let Some(body) = &mut self.body { + body.offset(offset); + } + } +} + +impl Spanned for Request { + fn span(&self) -> &Span { + &self.span + } +} + +/// An HTTP response code. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Code(pub(crate) Span); + +impl Code { + /// Returns the response code as a string slice. + pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.0.offset(offset); + } +} + +impl Spanned for Code { + fn span(&self) -> &Span { + &self.0 + } +} + +/// An HTTP response reason phrase. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Reason(pub(crate) Span); + +impl Reason { + /// Returns the response reason phrase as a string slice. 
+ pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.0.offset(offset); + } +} + +impl Spanned for Reason { + fn span(&self) -> &Span { + &self.0 + } +} + +/// An HTTP response status. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Status { + pub(crate) span: Span, + + /// The response code. + pub code: Code, + /// The reason phrase. + pub reason: Reason, +} + +impl Status { + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.span.offset(offset); + self.code.offset(offset); + self.reason.offset(offset); + } +} + +impl Spanned for Status { + fn span(&self) -> &Span { + &self.span + } +} + +/// An HTTP response. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Response { + pub(crate) span: Span, + /// The response status. + pub status: Status, + /// Response headers. + pub headers: Vec
, + /// Response body. + pub body: Option, +} + +impl Response { + /// Returns an iterator of response headers with the given name (case-insensitive). + /// + /// This method returns an iterator because it is valid for HTTP records to contain + /// duplicate header names. + pub fn headers_with_name<'a>(&'a self, name: &'a str) -> impl Iterator { + self.headers + .iter() + .filter(|h| h.name.0.as_str().eq_ignore_ascii_case(name)) + } + + /// Returns the indices of the response excluding the headers and body. + pub fn without_data(&self) -> RangeSet { + let mut indices = self.span.indices.clone(); + for header in &self.headers { + indices = indices.difference(header.span.indices()); + } + if let Some(body) = &self.body { + indices = indices.difference(body.span.indices()); + } + indices + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.span.offset(offset); + self.status.offset(offset); + for header in &mut self.headers { + header.offset(offset); + } + if let Some(body) = &mut self.body { + body.offset(offset); + } + } +} + +impl Spanned for Response { + fn span(&self) -> &Span { + &self.span + } +} + +/// An HTTP request or response body. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Body { + pub(crate) span: Span, +} + +impl Body { + /// Returns the body as a byte slice. + pub fn as_bytes(&self) -> &[u8] { + self.span.as_bytes() + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.span.offset(offset); + } +} + +impl Spanned for Body { + fn span(&self) -> &Span { + &self.span + } +} diff --git a/spansy/src/json/json.pest b/spansy/src/json/json.pest new file mode 100644 index 0000000..ae97782 --- /dev/null +++ b/spansy/src/json/json.pest @@ -0,0 +1,41 @@ +// pest. 
The Elegant Parser +// Copyright (c) 2018 Dragoș Tiselice +// +// Licensed under the Apache License, Version 2.0 +// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT +// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. All files in the project carrying such notice may not be copied, +// modified, or distributed except according to those terms. + +//! A parser for JSON files. +//! +//! And this is an example of a JSON parser. +json = _{ SOI ~ value ~ eoi } +eoi = _{ !ANY } + +/// Matches object, e.g.: `{ "foo": "bar" }` +/// Foobar +object = { "{" ~ pair ~ (pair)* ~ "}" | "{" ~ "}" } +pair = { quoted_string ~ ":" ~ value ~ (",")? } + +array = { "[" ~ value ~ ("," ~ value)* ~ "]" | "[" ~ "]" } + +////////////////////// +/// Matches value, e.g.: `"foo"`, `42`, `true`, `null`, `[]`, `{}`. +////////////////////// +value = _{ quoted_string | number | object | array | bool | null } + +quoted_string = _{ "\"" ~ string ~ "\"" } +string = @{ (!("\"" | "\\") ~ ANY)* ~ (escape ~ string)? } +escape = @{ "\\" ~ ("\"" | "\\" | "/" | "b" | "f" | "n" | "r" | "t" | unicode) } +unicode = @{ "u" ~ ASCII_HEX_DIGIT{4} } + +number = @{ "-"? ~ int ~ ("." ~ ASCII_DIGIT+ ~ exp? | exp)? } +int = @{ "0" | ASCII_NONZERO_DIGIT ~ ASCII_DIGIT* } +exp = @{ ("E" | "e") ~ ("+" | "-")? ~ ASCII_DIGIT+ } + +bool = { "true" | "false" } + +null = { "null" } + +WHITESPACE = _{ " " | "\t" | "\r" | "\n" } diff --git a/spansy/src/json/mod.rs b/spansy/src/json/mod.rs new file mode 100644 index 0000000..7801264 --- /dev/null +++ b/spansy/src/json/mod.rs @@ -0,0 +1,34 @@ +//! JSON span parsing. +//! +//! This module provides a JSON parser that can be used to parse span information for each JSON value within +//! a source string. +//! +//! Note that the parser does *not* fully parse values; it simply computes the span of the corresponding +//! characters in the source string. Thus, this parser should not be expected to perform any kind of +//! validation of the JSON. +//! +//! # Example +//! +//! ``` +//! use spansy::{json, Spanned}; +//! +//!
let src = "{\"foo\": {\"bar\": [42, 14]}}"; +//! +//! let value = json::parse_str(src).unwrap(); +//! +//! // We can assert that the value present at the path "foo.bar.1" is the number 14. +//! assert_eq!(value.get("foo.bar.1").unwrap().span(), "14"); +//! +//! let bar = value.get("foo.bar").unwrap(); +//! +//! // The span of the `bar` array is 16..24 within the source string. +//! assert_eq!(bar.span().indices(), 16..24); +//! ``` + +mod span; +mod types; +mod visit; + +pub use span::{parse, parse_slice, parse_str}; +pub use types::{Array, Bool, JsonKey, JsonValue, KeyValue, Null, Number, Object, String}; +pub use visit::JsonVisit; diff --git a/spansy/src/json/span.rs b/spansy/src/json/span.rs new file mode 100644 index 0000000..521b7ab --- /dev/null +++ b/spansy/src/json/span.rs @@ -0,0 +1,171 @@ +use bytes::Bytes; +use pest::{iterators::Pair as PestPair, Parser}; +use types::KeyValue; + +use super::types::{self, JsonValue}; + +use crate::{ParseError, Span}; + +#[derive(pest_derive::Parser)] +#[grammar = "json/json.pest"] +struct JsonParser; + +/// Parse a JSON value from a source string. +pub fn parse_str(src: &str) -> Result { + let src = Bytes::copy_from_slice(src.as_bytes()); + + // # Safety + // `src` was passed as a string slice, so it is guaranteed to be valid UTF-8. + let src_str = unsafe { std::str::from_utf8_unchecked(src.as_ref()) }; + + let value = JsonParser::parse(Rule::value, src_str)? + .next() + .ok_or_else(|| ParseError("no json value is present in source".to_string()))?; + + // Since json.pest grammar prohibits leading characters but allows trailing + // characters, we prohibit trailing characters here. + if value.as_str().len() != src.len() { + return Err(ParseError( + "trailing characters are present in source".to_string(), + )); + } + + Ok(JsonValue::from_pair(src.clone(), value)) +} + +/// Parse a JSON value from a byte slice. 
+pub fn parse_slice(src: &[u8]) -> Result { + let src = Bytes::copy_from_slice(src); + parse(src) +} + +/// Parse a JSON value from source bytes. +pub fn parse(src: Bytes) -> Result { + let src_str = std::str::from_utf8(&src)?; + + let value = JsonParser::parse(Rule::value, src_str)? + .next() + .ok_or_else(|| ParseError("no json value is present in source".to_string()))?; + + // Since json.pest grammar prohibits leading characters but allows trailing + // characters, we prohibit trailing characters here. + if value.as_str().len() != src.len() { + return Err(ParseError( + "trailing characters are present in source".to_string(), + )); + } + + Ok(JsonValue::from_pair(src.clone(), value)) +} + +macro_rules! impl_from_pair { + ($ty:ty, $rule:ident) => { + impl $ty { + fn from_pair(src: Bytes, pair: PestPair<'_, Rule>) -> Self { + assert!(matches!(pair.as_rule(), Rule::$rule)); + + Self(Span::new_from_str(src, pair.as_str())) + } + } + }; +} + +impl_from_pair!(types::JsonKey, string); +impl_from_pair!(types::Number, number); +impl_from_pair!(types::Bool, bool); +impl_from_pair!(types::Null, null); +impl_from_pair!(types::String, string); + +impl types::KeyValue { + fn from_pair(src: Bytes, pair: PestPair<'_, Rule>) -> Self { + assert!(matches!(pair.as_rule(), Rule::pair)); + + let span = Span::new_from_str(src.clone(), pair.as_str().trim_end()); + + let mut pairs = pair.into_inner(); + + let key = pairs.next().expect("key is present"); + let value = pairs.next().expect("value is present"); + + Self { + span, + key: types::JsonKey::from_pair(src.clone(), key), + value: types::JsonValue::from_pair(src.clone(), value), + } + } +} + +impl types::Object { + fn from_pair(src: Bytes, pair: PestPair<'_, Rule>) -> Self { + assert!(matches!(pair.as_rule(), Rule::object)); + + Self { + span: Span::new_from_str(src.clone(), pair.as_str()), + elems: pair + .into_inner() + .map(|pair| KeyValue::from_pair(src.clone(), pair)) + .collect(), + } + } +} + +impl types::Array { + fn 
from_pair(src: Bytes, pair: PestPair<'_, Rule>) -> Self { + assert!(matches!(pair.as_rule(), Rule::array)); + + Self { + span: Span::new_from_str(src.clone(), pair.as_str()), + elems: pair + .into_inner() + .map(|pair| types::JsonValue::from_pair(src.clone(), pair)) + .collect(), + } + } +} + +impl types::JsonValue { + fn from_pair(src: Bytes, pair: PestPair<'_, Rule>) -> Self { + match pair.as_rule() { + Rule::object => Self::Object(types::Object::from_pair(src, pair)), + Rule::array => Self::Array(types::Array::from_pair(src, pair)), + Rule::string => Self::String(types::String::from_pair(src, pair)), + Rule::number => Self::Number(types::Number::from_pair(src, pair)), + Rule::bool => Self::Bool(types::Bool::from_pair(src, pair)), + Rule::null => Self::Null(types::Null::from_pair(src, pair)), + rule => unreachable!("unexpected matched rule: {:?}", rule), + } + } +} +#[cfg(test)] +mod tests { + use crate::Spanned; + + use super::*; + + #[test] + fn test_json_spanner() { + let src = r#"{"foo": "bar", "baz": 123, "quux": { "a": "b", "c": "d" }, "arr": [1, 2, 3]}"#; + + let value = parse_str(src).unwrap(); + + assert_eq!(value.get("foo").unwrap().span(), "bar"); + assert_eq!(value.get("baz").unwrap().span(), "123"); + assert_eq!(value.get("quux.a").unwrap().span(), "b"); + assert_eq!(value.get("arr").unwrap().span(), "[1, 2, 3]"); + } + + #[test] + fn test_err_leading_characters() { + let src = " {\"foo\": \"bar\"}"; + assert!(parse_str(src).is_err()); + } + + #[test] + fn test_err_trailing_characters() { + let src = "{\"foo\": \"bar\"} "; + assert_eq!( + parse_str(src).err().unwrap().to_string(), + "parsing error: trailing characters are present in source" + ); + } +} diff --git a/spansy/src/json/types.rs b/spansy/src/json/types.rs new file mode 100644 index 0000000..5ff16d0 --- /dev/null +++ b/spansy/src/json/types.rs @@ -0,0 +1,401 @@ +use std::ops::{Index, Range}; + +use utils::range::{RangeDifference, RangeSet}; + +use crate::{Span, Spanned}; + +#[derive(Debug, 
Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// A JSON value. +pub enum JsonValue { + /// A null value. + Null(Null), + /// A boolean value. + Bool(Bool), + /// A number value. + Number(Number), + /// A string value. + String(String), + /// An array value. + Array(Array), + /// An object value. + Object(Object), +} + +impl JsonValue { + /// Returns the span corresponding to the value. + pub fn into_span(self) -> Span { + match self { + JsonValue::Null(v) => v.0, + JsonValue::Bool(v) => v.0, + JsonValue::Number(v) => v.0, + JsonValue::String(v) => v.0, + JsonValue::Array(v) => v.span, + JsonValue::Object(v) => v.span, + } + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + match self { + JsonValue::Null(v) => v.0.offset(offset), + JsonValue::Bool(v) => v.0.offset(offset), + JsonValue::Number(v) => v.0.offset(offset), + JsonValue::String(v) => v.0.offset(offset), + JsonValue::Array(v) => { + v.span.offset(offset); + v.elems.iter_mut().for_each(|v| v.offset(offset)) + } + JsonValue::Object(v) => { + v.span.offset(offset); + v.elems.iter_mut().for_each(|kv| { + kv.span.offset(offset); + kv.key.offset(offset); + kv.value.offset(offset); + }) + } + } + } +} + +impl Spanned for JsonValue { + fn span(&self) -> &Span { + match self { + JsonValue::Null(v) => v.span(), + JsonValue::Bool(v) => v.span(), + JsonValue::Number(v) => v.span(), + JsonValue::String(v) => v.span(), + JsonValue::Array(v) => v.span(), + JsonValue::Object(v) => v.span(), + } + } +} + +impl JsonValue { + /// Get a reference to the value using the given path. 
+ /// + /// # Example + /// + /// ``` + /// use spansy::json::parse_str; + /// use spansy::Spanned; + /// + /// let src = "{\"foo\": {\"bar\": [42, 14]}}"; + /// + /// let value = parse_str(src).unwrap(); + /// + /// assert_eq!(value.get("foo.bar.1").unwrap().span(), "14"); + /// ``` + pub fn get(&self, path: &str) -> Option<&JsonValue> { + match self { + JsonValue::Null(_) => None, + JsonValue::Bool(_) => None, + JsonValue::Number(_) => None, + JsonValue::String(_) => None, + JsonValue::Array(v) => v.get(path), + JsonValue::Object(v) => v.get(path), + } + } +} + +/// A key value pair in a JSON object. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct KeyValue { + pub(crate) span: Span, + + /// The key of the pair. + pub key: JsonKey, + /// The value of the pair. + pub value: JsonValue, +} + +impl KeyValue { + /// Returns the indices of the key value pair, excluding the value. + pub fn without_value(&self) -> RangeSet { + self.span.indices.difference(&self.value.span().indices) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// A key in a JSON object. +pub struct JsonKey(pub(crate) Span); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// A null value. +pub struct Null(pub(crate) Span); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// A boolean value. +pub struct Bool(pub(crate) Span); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// A number value. +pub struct Number(pub(crate) Span); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// A string value. 
+pub struct String(pub(crate) Span); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// An array value. +pub struct Array { + pub(crate) span: Span, + /// The elements of the array. + pub elems: Vec, +} + +impl Array { + /// Get a reference to the value using the given path. + pub fn get(&self, path: &str) -> Option<&JsonValue> { + let mut path_iter = path.split('.'); + + let key = path_iter.next()?; + let idx = key.parse::().ok()?; + + let value = self.elems.get(idx)?; + + if path_iter.next().is_some() { + value.get(&path[key.len() + 1..]) + } else { + Some(value) + } + } + + /// Returns the indices of the array, excluding the values and separators. + pub fn without_values(&self) -> RangeSet { + let start = self + .span + .indices + .min() + .expect("array has at least brackets"); + let end = self + .span + .indices + .max() + .expect("array has at least brackets"); + + RangeSet::from([start..start + 1, end..end + 1]) + } +} + +impl Index for Array { + type Output = JsonValue; + + /// Returns the value at the given index of the array. + /// + /// # Panics + /// + /// Panics if the index is out of bounds. + fn index(&self, index: usize) -> &Self::Output { + self.elems.get(index).expect("index is in bounds") + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +/// A JSON object value. +pub struct Object { + pub(crate) span: Span, + /// The key value pairs of the object. + pub elems: Vec, +} + +impl Object { + /// Get a reference to the value using the given path. + pub fn get(&self, path: &str) -> Option<&JsonValue> { + let mut path_iter = path.split('.'); + + let key = path_iter.next()?; + + let KeyValue { value, .. 
} = self.elems.iter().find(|kv| kv.key == key)?; + + if path_iter.next().is_some() { + value.get(&path[key.len() + 1..]) + } else { + Some(value) + } + } + + /// Returns the indices of the object, excluding the key value pairs. + pub fn without_pairs(&self) -> RangeSet { + let mut indices = self.span.indices.clone(); + for kv in &self.elems { + indices = indices.difference(&kv.span.indices); + } + indices + } +} + +impl Index<&str> for Object { + type Output = JsonValue; + + /// Returns the value at the given key of the object. + /// + /// # Panics + /// + /// Panics if the key is not present. + fn index(&self, key: &str) -> &Self::Output { + self.get(key).expect("key is present") + } +} + +macro_rules! impl_type { + ($ty:ident, $span:tt) => { + impl $ty { + /// Returns the span corresponding to the value. + pub fn into_span(self) -> Span { + self.$span + } + + /// Shifts the span range by the given offset. + pub fn offset(&mut self, offset: usize) { + self.$span.offset(offset); + } + } + + impl Spanned for $ty { + fn span(&self) -> &Span { + &self.$span + } + } + + impl PartialEq for $ty { + fn eq(&self, other: &str) -> bool { + self.$span == other + } + } + + impl PartialEq<$ty> for str { + fn eq(&self, other: &$ty) -> bool { + self == &other.$span + } + } + + impl PartialEq<&str> for $ty { + fn eq(&self, other: &&str) -> bool { + self.$span == *other + } + } + + impl PartialEq<$ty> for &str { + fn eq(&self, other: &$ty) -> bool { + *self == &other.$span + } + } + + impl PartialEq> for $ty { + fn eq(&self, other: &Range) -> bool { + &self.$span == other + } + } + + impl PartialEq<$ty> for Range { + fn eq(&self, other: &$ty) -> bool { + self == &other.$span + } + } + + impl PartialEq> for $ty { + fn eq(&self, other: &Span) -> bool { + &self.$span == other + } + } + + impl PartialEq<$ty> for Span { + fn eq(&self, other: &$ty) -> bool { + self == &other.$span + } + } + }; +} + +impl_type!(JsonKey, 0); +impl_type!(Null, 0); +impl_type!(Bool, 0); +impl_type!(Number, 
0); +impl_type!(String, 0); +impl_type!(Array, span); +impl_type!(Object, span); +impl_type!(KeyValue, span); + +#[cfg(test)] +mod tests { + use utils::range::IndexRanges; + + use crate::json::parse_str; + + use super::*; + + #[test] + fn test_obj_index() { + let src = "{\"foo\": \"bar\"}"; + + let value = parse_str(src).unwrap(); + + assert_eq!(value.get("foo").unwrap().span(), "bar"); + } + + #[test] + fn test_array_index() { + let src = "{\"foo\": [42, 14]}"; + + let value = parse_str(src).unwrap(); + + assert_eq!(value.get("foo.1").unwrap().span(), "14"); + } + + #[test] + fn test_nested_index() { + let src = "{\"foo\": {\"bar\": [42, 14]}}"; + + let value = parse_str(src).unwrap(); + + assert_eq!(value.get("foo.bar.1").unwrap().span(), "14"); + } + + #[test] + fn test_key_value_without_value() { + let src = "{\"foo\": \"bar\"\n}"; + + let JsonValue::Object(value) = parse_str(src).unwrap() else { + panic!("expected object"); + }; + + let indices = value.elems[0].without_value(); + + assert_eq!(src.index_ranges(&indices), "\"foo\": \"\""); + } + + #[test] + fn test_array_without_values() { + let src = "[42, 14]"; + + let JsonValue::Array(value) = parse_str(src).unwrap() else { + panic!("expected object"); + }; + + let indices = value.without_values(); + + assert_eq!(src.index_ranges(&indices), "[]"); + } + + #[test] + fn test_object_without_pairs() { + let src = "{\"foo\": \"bar\"\n}"; + + let JsonValue::Object(value) = parse_str(src).unwrap() else { + panic!("expected object"); + }; + + let indices = value.without_pairs(); + + assert_eq!(src.index_ranges(&indices), "{\n}"); + } +} diff --git a/spansy/src/json/visit.rs b/spansy/src/json/visit.rs new file mode 100644 index 0000000..90dcc22 --- /dev/null +++ b/spansy/src/json/visit.rs @@ -0,0 +1,84 @@ +use super::{types, types::JsonValue}; + +/// A visitor for JSON values. 
+/// +/// # Example +/// +/// ``` +/// use spansy::json::{parse_str, Number, JsonVisit}; +/// use spansy::Spanned; +/// +/// struct DigitReplacer<'a, 'b> { +/// src: &'a mut String, +/// digit: &'b str, +/// } +/// +/// impl<'a> JsonVisit for DigitReplacer<'a, '_> { +/// fn visit_number(&mut self, node: &Number) { +/// let span = node.span(); +/// for range in span.indices().iter_ranges() { +/// let replacement = self.digit.repeat(range.len()); +/// self.src.replace_range(range, &replacement); +/// } +/// } +/// } +/// +/// let src = "{\"foo\": [42, 69]}"; +/// +/// let value = parse_str(src).unwrap(); +/// +/// let mut new = src.to_string(); +/// +/// // Replace the digits of all numbers with 9. +/// DigitReplacer { src: &mut new, digit: "9" }.visit_value(&value); +/// +/// assert_eq!(new, "{\"foo\": [99, 99]}"); +/// ``` +pub trait JsonVisit { + /// Visit a key value pair in a JSON object. + fn visit_key_value(&mut self, node: &types::KeyValue) { + self.visit_key(&node.key); + self.visit_value(&node.value); + } + + /// Visit a key in a JSON object. + fn visit_key(&mut self, _node: &types::JsonKey) {} + + /// Visit a JSON value. + fn visit_value(&mut self, node: &JsonValue) { + match node { + JsonValue::Null(value) => self.visit_null(value), + JsonValue::Bool(value) => self.visit_bool(value), + JsonValue::Number(value) => self.visit_number(value), + JsonValue::String(value) => self.visit_string(value), + JsonValue::Array(value) => self.visit_array(value), + JsonValue::Object(value) => self.visit_object(value), + } + } + + /// Visit an array value. + fn visit_array(&mut self, node: &types::Array) { + for elem in &node.elems { + self.visit_value(elem); + } + } + + /// Visit an object value. + fn visit_object(&mut self, node: &types::Object) { + for kv in &node.elems { + self.visit_key_value(kv); + } + } + + /// Visit a null value. + fn visit_null(&mut self, _node: &types::Null) {} + + /// Visit a boolean value. 
+ fn visit_bool(&mut self, _node: &types::Bool) {} + + /// Visit a number value. + fn visit_number(&mut self, _node: &types::Number) {} + + /// Visit a string value. + fn visit_string(&mut self, _node: &types::String) {} +} diff --git a/spansy/src/lib.rs b/spansy/src/lib.rs new file mode 100644 index 0000000..1fb96de --- /dev/null +++ b/spansy/src/lib.rs @@ -0,0 +1,305 @@ +//! Parsing span information. + +#![deny(missing_docs, unreachable_pub, unused_must_use)] +#![deny(clippy::all)] + +use std::{fmt::Debug, marker::PhantomData, ops::Range}; + +use bytes::Bytes; + +pub(crate) mod helpers; +pub mod http; +pub mod json; + +use utils::range::RangeSet; + +/// A parsing error. +#[derive(Debug, thiserror::Error)] +#[error("parsing error: {0}")] +pub struct ParseError(String); + +impl From> for ParseError { + fn from(value: pest::error::Error) -> Self { + Self(value.to_string()) + } +} + +impl From for ParseError { + fn from(value: std::str::Utf8Error) -> Self { + Self(value.to_string()) + } +} + +/// A spanned value. +pub trait Spanned { + /// Get a reference to the span of the value. + fn span(&self) -> &Span; +} + +/// A span of a source string. +#[derive(PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Span { + /// The original source bytes from when the span was parsed. + pub(crate) data: Bytes, + /// The set of indices within the source data. 
+ pub(crate) indices: RangeSet, + _pd: PhantomData, +} + +impl Clone for Span<[u8]> { + fn clone(&self) -> Self { + Self { + data: self.data.clone(), + indices: self.indices.clone(), + _pd: PhantomData, + } + } +} + +impl Clone for Span { + fn clone(&self) -> Self { + Self { + data: self.data.clone(), + indices: self.indices.clone(), + _pd: PhantomData, + } + } +} + +impl Debug for Span<[u8]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Span") + .field("span", &self.as_bytes()) + .field("indices", &self.indices) + .finish() + } +} + +impl Debug for Span { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Span") + .field("span", &self.as_str()) + .field("indices", &self.indices) + .finish() + } +} + +impl Span { + /// Returns a reference to the span data. + pub fn data(&self) -> &[u8] { + self.data.as_ref() + } + + /// Converts the span into bytes. + pub fn to_bytes(self) -> Bytes { + self.data + } + + /// Returns the indices within the source data. + pub fn indices(&self) -> &RangeSet { + &self.indices + } + + /// Returns the length of the span in bytes. + /// + /// Just like `str::len()`, this is not necessarily the number of characters. + pub fn len(&self) -> usize { + self.indices.len() + } + + /// Returns `true` if the span is empty. + pub fn is_empty(&self) -> bool { + self.indices.is_empty() + } + + /// Shifts the span indices by the given offset. + /// + /// # Panics + /// + /// Panics if the offset causes the indices to overflow `usize::MAX`. + pub fn offset(&mut self, offset: usize) { + self.indices.shift_right(&offset); + } +} + +impl Span { + /// Create a new string span. + /// + /// # Panics + /// + /// Panics if the given range is not within the source bytes, or + /// if the span is not a valid UTF-8 string. 
+ pub(crate) fn new_str(src: Bytes, range: Range) -> Self { + assert!( + std::str::from_utf8(&src[range.clone()]).is_ok(), + "span is not a valid UTF-8 string" + ); + + Self { + data: src.slice(range.clone()), + indices: range.into(), + _pd: PhantomData, + } + } + + /// Create a new string span from a string slice. + /// + /// # Panics + /// + /// Panics if the given slice is not within the source bytes. + pub(crate) fn new_from_str(src: Bytes, span: &str) -> Self { + let range = helpers::get_span_range(src.as_ref(), span.as_bytes()); + + Self { + data: src.slice(range.clone()), + indices: range.into(), + _pd: PhantomData, + } + } + + /// Converts this type to a string slice. + pub fn as_str(&self) -> &str { + self.as_ref() + } + + /// Returns the corresponding byte span. + pub fn to_byte_span(&self) -> Span<[u8]> { + self.into() + } +} + +impl AsRef for Span { + fn as_ref(&self) -> &str { + // # Safety + // The span is guaranteed to be a valid UTF-8 string because it is not + // possible to create a `Span` from a non-UTF-8 string. + unsafe { std::str::from_utf8_unchecked(&self.data) } + } +} + +impl AsRef<[u8]> for Span { + fn as_ref(&self) -> &[u8] { + self.data.as_ref() + } +} + +impl Span<[u8]> { + /// Create a new byte span. + /// + /// # Panics + /// + /// Panics if the given range is not within the source bytes. + pub(crate) fn new_bytes(src: Bytes, range: Range) -> Self { + assert!(src.len() >= range.end, "span is not within source bytes"); + + Self { + data: src.slice(range.clone()), + indices: range.into(), + _pd: PhantomData, + } + } + + /// Converts this type to a byte slice. 
+ pub fn as_bytes(&self) -> &[u8] { + self.as_ref() + } +} + +impl AsRef<[u8]> for Span<[u8]> { + fn as_ref(&self) -> &[u8] { + self.data.as_ref() + } +} + +impl From> for Span<[u8]> { + fn from(span: Span) -> Self { + Self { + data: span.data, + indices: span.indices, + _pd: PhantomData, + } + } +} + +impl From<&Span> for Span<[u8]> { + fn from(span: &Span) -> Self { + Self { + data: span.data.clone(), + indices: span.indices.clone(), + _pd: PhantomData, + } + } +} + +impl PartialEq for [u8] { + fn eq(&self, other: &Span) -> bool { + self == other.as_ref() + } +} + +impl PartialEq<[u8]> for Span { + fn eq(&self, other: &[u8]) -> bool { + self.as_ref() == other + } +} + +impl PartialEq<&[u8]> for Span { + fn eq(&self, other: &&[u8]) -> bool { + self.as_ref() == *other + } +} + +impl PartialEq<[u8]> for &Span { + fn eq(&self, other: &[u8]) -> bool { + self.as_ref() == other + } +} + +impl PartialEq> for str { + fn eq(&self, other: &Span) -> bool { + self == other.as_str() + } +} + +impl PartialEq for Span { + fn eq(&self, other: &str) -> bool { + self.as_str() == other + } +} + +impl PartialEq<&str> for Span { + fn eq(&self, other: &&str) -> bool { + self.as_str() == *other + } +} + +impl PartialEq for &Span { + fn eq(&self, other: &str) -> bool { + self.as_str() == other + } +} + +impl PartialEq> for Span { + fn eq(&self, other: &Range) -> bool { + &self.indices == other + } +} + +impl PartialEq> for Range { + fn eq(&self, other: &Span) -> bool { + other == self + } +} + +impl PartialEq> for &Span { + fn eq(&self, other: &Range) -> bool { + *self == other + } +} + +impl PartialEq> for &Range { + fn eq(&self, other: &Span) -> bool { + other == *self + } +}