Improve string indexing, codePointAt

This commit is contained in:
Andrew Morris
2023-03-13 12:52:33 +11:00
parent b6045b24c1
commit d33d93bd34
3 changed files with 110 additions and 15 deletions

View File

@@ -9,7 +9,7 @@ pub fn op_sub_string(string_data: &Rc<String>, subscript: &Val) -> Val {
let method_str = method.as_str();
return match method_str {
"length" => Val::Number(string_data.len() as f64),
"length" => Val::Number(string_data.as_bytes().len() as f64),
_ => get_string_method(method_str),
};
}
@@ -22,7 +22,10 @@ pub fn op_sub_string(string_data: &Rc<String>, subscript: &Val) -> Val {
return Val::Undefined;
}
string_from_byte(string_bytes[right_index])
match unicode_at(string_bytes, right_index) {
Some(string) => Val::String(Rc::new(string)),
None => Val::String(Rc::new(String::from(""))),
}
}
pub fn get_string_method(method: &str) -> Val {
@@ -30,6 +33,7 @@ pub fn get_string_method(method: &str) -> Val {
"at" => Val::Static(&AT),
"charAt" => Val::Static(&CHAR_AT),
"charCodeAt" => Val::Static(&CHAR_CODE_AT),
"codePointAt" => Val::Static(&CODE_POINT_AT),
_ => Val::Undefined,
}
}
@@ -45,9 +49,10 @@ static AT: NativeFunction = NativeFunction {
Some(i) => i,
};
let byte = string_bytes[index];
Val::String(Rc::new(String::from_utf8_lossy(&[byte]).into_owned()))
match unicode_at(string_bytes, index) {
Some(string) => Val::String(Rc::new(string)),
None => Val::String(Rc::new(String::from(""))),
}
}
_ => std::panic!("Not implemented: exceptions/string indirection"),
}
@@ -58,7 +63,10 @@ static CHAR_AT: NativeFunction = NativeFunction {
fn_: |this: &mut Val, params: Vec<Val>| -> Val {
match this {
Val::String(string_data) => match byte_at(string_data, params.get(0)) {
Some(byte) => string_from_byte(byte),
Some(byte) => match std::char::from_u32(byte as u32) {
Some(c) => Val::String(Rc::new(c.to_string())),
None => Val::String(Rc::new(String::from(""))),
},
None => Val::String(Rc::new(String::from(""))),
},
_ => std::panic!("Not implemented: exceptions/string indirection"),
@@ -78,6 +86,73 @@ static CHAR_CODE_AT: NativeFunction = NativeFunction {
},
};
static CODE_POINT_AT: NativeFunction = NativeFunction {
fn_: |this: &mut Val, params: Vec<Val>| -> Val {
match this {
Val::String(string_data) => {
let string_bytes = string_data.as_bytes();
let index = match params.get(0) {
Some(i) => match i.to_index() {
None => return Val::Undefined,
Some(i) => i,
},
_ => return Val::Undefined,
};
match code_point_at(string_bytes, index) {
Some(code_point) => Val::Number(code_point as f64),
None => Val::Undefined,
}
}
_ => std::panic!("Not implemented: exceptions/string indirection"),
}
},
};
fn unicode_at(bytes: &[u8], index: usize) -> Option<String> {
match code_point_at(bytes, index) {
Some(code_point) => Some(
std::char::from_u32(code_point)
.expect("Invalid code point") // TODO: Find out if this is reachable and what to do about it
.to_string(),
),
None => None,
}
}
fn code_point_at(bytes: &[u8], index: usize) -> Option<u32> {
if index >= bytes.len() {
return None;
}
let byte = bytes[index];
let leading_ones = byte.leading_ones() as usize;
if leading_ones == 0 {
return Some(byte as u32);
}
if leading_ones == 1 || leading_ones > 4 || index + leading_ones > bytes.len() {
return None;
}
let mut value = (byte & (0x7F >> leading_ones)) as u32;
for i in 1..leading_ones {
let next_byte = bytes[index + i];
if next_byte.leading_ones() != 1 {
return None;
}
value = (value << 6) | (next_byte & 0x3F) as u32;
}
Some(value)
}
fn byte_at(string: &String, index_param: Option<&Val>) -> Option<u8> {
let mut index = match index_param {
Some(i) => i.to_number(),
@@ -104,12 +179,3 @@ fn byte_at(string: &String, index_param: Option<&Val>) -> Option<u8> {
Some(string_bytes[index_usize])
}
fn string_from_byte(byte: u8) -> Val {
// TODO: Val::Strings need to change to not use rust's string type,
// because they need to represent an actual byte array underneath. This
// occurs for invalid utf8 sequences which are getting converted to U+FFFD
// here. To be analogous to js, the information of the actual byte needs
// to be preserved, but that can't be represented in rust's string type.
Val::String(Rc::new(String::from_utf8_lossy(&[byte]).into_owned()))
}