Source code

Revision control

Copy as Markdown

Other Tools

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
#[cfg(feature = "serde")]
use alloc::format;
#[cfg(feature = "serde")]
use alloc::string::String;
use alloc::vec::Vec;
use core::{char, ops::RangeBounds, ops::RangeInclusive};
use yoke::Yokeable;
use zerofrom::ZeroFrom;
use zerovec::{ule::AsULE, zerovec, ZeroVec};
use super::CodePointInversionListError;
use crate::codepointinvlist::utils::{deconstruct_range, is_valid_zv};
/// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0, inclusive
/// (that is, the BMP is the inclusive range `0x0..=0xFFFF`).
const BMP_MAX: u32 = 0xFFFF;
/// Represents the inversion list for a set of all code points in the Basic Multilingual Plane.
///
/// Stored as the single pair `[0x0, BMP_MAX + 1]`; the second entry is one past the last BMP code point.
const BMP_INV_LIST_VEC: ZeroVec<u32> =
zerovec!(u32; <u32 as AsULE>::ULE::from_unsigned; [0x0, BMP_MAX + 1]);
/// Represents the inversion list for all of the code points in the Unicode range.
///
/// Stored as the single pair `[0x0, char::MAX + 1]`; the second entry is one past `char::MAX`.
const ALL_VEC: ZeroVec<u32> =
zerovec!(u32; <u32 as AsULE>::ULE::from_unsigned; [0x0, (char::MAX as u32) + 1]);
/// A set of Unicode code points, stored as an
/// [inversion list](https://en.wikipedia.org/wiki/Inversion_list).
///
/// Provides exposure to membership functions and constructors from serialized `CodePointSet`s (sets of code points)
/// and predefined ranges.
#[zerovec::make_varule(CodePointInversionListULE)]
#[zerovec::skip_derive(Ord)]
#[zerovec::derive(Debug)]
#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
pub struct CodePointInversionList<'data> {
// If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature
// Allows for traits of fixed size arrays
// Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list)
// Entries are read in pairs `[start, limit)`: `start` is the first code point of a range and
// `limit` is one past its last code point (see `iter_ranges`).
inv_list: ZeroVec<'data, u32>,
// Number of code points in the set: the sum of `limit - start` over all pairs,
// as computed by `try_from_inversion_list`.
size: u32,
}
#[cfg(feature = "serde")]
impl<'de: 'a, 'a> serde::Deserialize<'de> for CodePointInversionList<'a> {
    /// Deserializes a [`CodePointInversionList`].
    ///
    /// Human-readable formats accept either the legacy form (a sequence of `u32`
    /// inversion-list entries) or the current form (a sequence of strings, each being
    /// a single code point or a `start-end` inclusive range). Other formats carry the
    /// raw inversion list. In every case the resulting list is validated by
    /// [`CodePointInversionList::try_from_inversion_list`].
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error;

        /// Parses `"X"` or `"X-Y"` into an inclusive `(first, last)` pair of code points.
        /// Returns `None` when the text has any other shape.
        fn parse_range(text: &str) -> Option<(u32, u32)> {
            let (first, rest) = UnicodeCodePoint::parse(text)?;
            if rest.is_empty() {
                // A lone code point denotes a one-element range.
                return Some((first.0, first.0));
            }
            let (separator, rest) = UnicodeCodePoint::parse(rest)?;
            if separator.0 != '-' as u32 {
                return None;
            }
            let (last, rest) = UnicodeCodePoint::parse(rest)?;
            // Trailing text after the end code point makes the range invalid.
            rest.is_empty().then_some((first.0, last.0))
        }

        let parsed_inv_list = if deserializer.is_human_readable() {
            #[derive(serde::Deserialize)]
            #[serde(untagged)]
            pub enum De<'data> {
                // TODO(#2856): Remove in ICU4X 2.0
                #[serde(borrow)]
                OldStyle(ZeroVec<'data, u32>),
                #[serde(borrow)]
                NewStyle(Vec<alloc::borrow::Cow<'data, str>>),
            }
            match De::<'de>::deserialize(deserializer)? {
                De::OldStyle(legacy_list) => legacy_list,
                De::NewStyle(parsed_strings) => {
                    // Two entries (start and exclusive limit) per textual range.
                    let mut endpoints: Vec<<u32 as AsULE>::ULE> =
                        Vec::with_capacity(parsed_strings.len() * 2);
                    for range in parsed_strings {
                        let (start, end) = parse_range(&range).ok_or_else(|| {
                            Error::custom(format!(
                                "Cannot deserialize invalid inversion list for CodePointInversionList: {range:?}"
                            ))
                        })?;
                        endpoints.push(start.to_unaligned());
                        // The textual range is inclusive; the inversion list stores an exclusive limit.
                        endpoints.push((end + 1).to_unaligned());
                    }
                    ZeroVec::<u32>::new_owned(endpoints)
                }
            }
        } else {
            ZeroVec::<u32>::deserialize(deserializer)?
        };

        CodePointInversionList::try_from_inversion_list(parsed_inv_list).map_err(|e| {
            Error::custom(format!(
                "Cannot deserialize invalid inversion list for CodePointInversionList: {e:?}"
            ))
        })
    }
}
#[cfg(feature = "databake")]
impl databake::Bake for CodePointInversionList<'_> {
// Emits tokens that rebuild this value by passing the baked `inv_list` and `size`
// to `CodePointInversionList::from_parts_unchecked`.
fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
env.insert("icu_collections");
let inv_list = self.inv_list.bake(env);
let size = self.size.bake(env);
// Safe because our parts are safe.
// (Both values are taken from `self`, so the generated call receives this instance's own list and size.)
databake::quote! { unsafe {
#[allow(unused_unsafe)]
icu_collections::codepointinvlist::CodePointInversionList::from_parts_unchecked(#inv_list, #size)
}}
}
}
// Helper for the human-readable serde format: wraps a `u32` code point so it can be
// parsed from, and displayed as, either a literal character or a `U+XXXX` escape.
#[cfg(feature = "serde")]
#[derive(Debug, Copy, Clone)]
struct UnicodeCodePoint(u32);
#[cfg(feature = "serde")]
impl UnicodeCodePoint {
    /// Wraps `cp`, rejecting values above `char::MAX`.
    ///
    /// Surrogate code points (`0xD800..=0xDFFF`) are accepted.
    fn from_u32(cp: u32) -> Result<Self, String> {
        if cp <= char::MAX as u32 {
            Ok(Self(cp))
        } else {
            Err(format!("Not a Unicode code point {}", cp))
        }
    }

    /// Parses one code point from the front of `value` and returns it together
    /// with the unparsed remainder.
    ///
    /// Two spellings are understood:
    /// * `U+` followed by exactly four hexadecimal digits;
    /// * otherwise, the first `char` of `value` taken literally.
    ///
    /// Returns `None` if `value` is empty or a `U+` escape is malformed.
    fn parse(value: &str) -> Option<(Self, &str)> {
        if let Some(hex) = value.strip_prefix("U+") {
            let (escape, remainder) = (hex.get(..4)?, hex.get(4..)?);
            // `u32::from_str_radix` tolerates a leading `+` sign, which would let
            // inputs such as `U++041` through; require four real hex digits.
            if !escape.bytes().all(|b| b.is_ascii_hexdigit()) {
                return None;
            }
            let cp = u32::from_str_radix(escape, 16).ok()?;
            Some((Self(cp), remainder))
        } else {
            let c = value.chars().next()?;
            Some((Self(c as u32), value.get(c.len_utf8()..)?))
        }
    }
}
#[cfg(feature = "serde")]
impl core::fmt::Display for UnicodeCodePoint {
    /// Writes the code point as its literal character when it is a Unicode scalar
    /// value, and as an uppercase-hex `U+XXXX` escape otherwise (surrogates cannot
    /// be represented as a `char`).
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // `char::from_u32` returns `None` exactly for surrogates and out-of-range
        // values, so no `unsafe` conversion (and no reliance on how the wrapper
        // was constructed) is needed.
        match char::from_u32(self.0) {
            Some(c) => write!(f, "{c}"),
            None => write!(f, "U+{:X}", self.0),
        }
    }
}
#[cfg(feature = "serde")]
impl<'data> serde::Serialize for CodePointInversionList<'data> {
    /// Serializes the set.
    ///
    /// Human-readable formats get a sequence of strings, one per range: a single
    /// code point for one-element ranges, `start-end` (inclusive) otherwise.
    /// Other formats get the raw inversion list.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        use serde::ser::SerializeSeq;

        if !serializer.is_human_readable() {
            // Note: serde(flatten) currently does not promote a struct field of type Vec
            // to replace the struct when serializing. The error message from the default
            // serialization is: "can only flatten structs and maps (got a sequence)".
            return self.inv_list.serialize(serializer);
        }

        let mut seq = serializer.serialize_seq(Some(self.inv_list.len() / 2))?;
        for range in self.iter_ranges() {
            let (first, last) = (*range.start(), *range.end());
            let start = UnicodeCodePoint::from_u32(first).map_err(S::Error::custom)?;
            let text = if first == last {
                format!("{start}")
            } else {
                let end = UnicodeCodePoint::from_u32(last).map_err(S::Error::custom)?;
                format!("{start}-{end}")
            };
            seq.serialize_element(&text)?;
        }
        seq.end()
    }
}
impl<'data> CodePointInversionList<'data> {
/// Returns a new [`CodePointInversionList`] from an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
/// represented as a [`ZeroVec`]`<`[`u32`]`>` of code points.
///
/// The inversion list must be of even length, sorted ascending non-overlapping,
/// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
///
/// # Errors
///
/// Returns [`CodePointInversionListError::InvalidSet`] carrying a copy of the input
/// when the list fails validation.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// use icu::collections::codepointinvlist::CodePointInversionListError;
/// use zerovec::ZeroVec;
/// let valid = [0x0, 0x10000];
/// let inv_list: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(&valid);
/// let result = CodePointInversionList::try_from_inversion_list(inv_list);
/// assert!(matches!(result, Ok(_)));
///
/// let invalid: Vec<u32> = vec![0x0, 0x80, 0x3];
/// let inv_list: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(&invalid);
/// let result = CodePointInversionList::try_from_inversion_list(inv_list);
/// assert!(matches!(
///     result,
///     Err(CodePointInversionListError::InvalidSet(_))
/// ));
/// if let Err(CodePointInversionListError::InvalidSet(actual)) = result {
///     assert_eq!(&invalid, &actual);
/// }
/// ```
pub fn try_from_inversion_list(
    inv_list: ZeroVec<'data, u32>,
) -> Result<Self, CodePointInversionListError> {
    if !is_valid_zv(&inv_list) {
        return Err(CodePointInversionListError::InvalidSet(inv_list.to_vec()));
    }
    // The set's cardinality is the sum of the lengths of its `[start, limit)` pairs.
    // Indexing `[1]` relies on validation having rejected odd-length lists.
    #[allow(clippy::indexing_slicing)] // chunks
    let size = inv_list
        .as_ule_slice()
        .chunks(2)
        .map(|end_points| {
            <u32 as AsULE>::from_unaligned(end_points[1])
                - <u32 as AsULE>::from_unaligned(end_points[0])
        })
        .sum::<u32>();
    Ok(Self { inv_list, size })
}
#[doc(hidden)] // databake internal
pub const unsafe fn from_parts_unchecked(inv_list: ZeroVec<'data, u32>, size: u32) -> Self {
Self { inv_list, size }
}
/// Returns a new [`CodePointInversionList`] by borrowing an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
/// represented as a slice of [`u32`] code points.
///
/// The inversion list must be of even length, sorted ascending non-overlapping,
/// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
///
/// Note: The slice may be cloned on certain platforms; for more information, see [`ZeroVec::from_slice_or_alloc`].
///
/// # Errors
///
/// Returns [`CodePointInversionListError::InvalidSet`] carrying a copy of the input
/// when the list fails validation.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// use icu::collections::codepointinvlist::CodePointInversionListError;
/// let valid = [0x0, 0x10000];
/// let result = CodePointInversionList::try_from_inversion_list_slice(&valid);
/// assert!(matches!(result, Ok(_)));
///
/// let invalid: Vec<u32> = vec![0x0, 0x80, 0x3];
/// let result =
///     CodePointInversionList::try_from_inversion_list_slice(&invalid);
/// assert!(matches!(
///     result,
///     Err(CodePointInversionListError::InvalidSet(_))
/// ));
/// if let Err(CodePointInversionListError::InvalidSet(actual)) = result {
///     assert_eq!(&invalid, &actual);
/// }
/// ```
pub fn try_from_inversion_list_slice(
    inv_list: &'data [u32],
) -> Result<Self, CodePointInversionListError> {
    // Wrap the slice (borrowing where the platform allows) and defer to the validating constructor.
    CodePointInversionList::try_from_inversion_list(ZeroVec::from_slice_or_alloc(inv_list))
}
/// Returns a new, fully-owned [`CodePointInversionList`] by cloning an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
/// represented as a slice of [`u32`] code points.
///
/// The inversion list must be of even length, sorted ascending non-overlapping,
/// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
///
/// let bmp_list = &[0x0, 0x10000];
/// let smp_list = &[0x10000, 0x20000];
/// let sip_list = &[0x20000, 0x30000];
///
/// let lists: Vec<CodePointInversionList> =
/// [&bmp_list[..], smp_list, sip_list]
/// .into_iter()
/// .map(|l| {
/// CodePointInversionList::try_clone_from_inversion_list_slice(l)
/// .unwrap()
/// })
/// .collect();
///
/// let bmp = &lists[0];
/// assert!(bmp.contains32(0xFFFF));
/// assert!(!bmp.contains32(0x10000));
///
/// assert!(!lists.iter().any(|set| set.contains32(0x40000)));
/// ```
pub fn try_clone_from_inversion_list_slice(
inv_list: &[u32],
) -> Result<Self, CodePointInversionListError> {
let inv_list_zv: ZeroVec<u32> = ZeroVec::alloc_from_slice(inv_list);
CodePointInversionList::try_from_inversion_list(inv_list_zv)
}
/// Attempts to convert this list into a fully-owned one. No-op if already fully owned
pub fn into_owned(self) -> CodePointInversionList<'static> {
CodePointInversionList {
inv_list: self.inv_list.into_owned(),
size: self.size,
}
}
/// Copies the underlying inversion list into a freshly allocated [`Vec`]`<`[`u32`]`>`.
pub fn get_inversion_list_vec(&self) -> Vec<u32> {
    // Goes through the crate-private accessor so the storage type is not exposed.
    self.as_inversion_list().to_vec()
}
/// Returns the [`CodePointInversionList`] that contains every code point.
///
/// The set covers `0x0 -> 0x10FFFF` inclusive.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
///
/// let expected = [0x0, (char::MAX as u32) + 1];
/// assert_eq!(
///     CodePointInversionList::all().get_inversion_list_vec(),
///     expected
/// );
/// assert_eq!(
///     CodePointInversionList::all().size(),
///     (expected[1] - expected[0]) as usize
/// );
/// ```
pub fn all() -> Self {
    // One element per code point in `0..=char::MAX`.
    let size = (char::MAX as u32) + 1;
    Self {
        inv_list: ALL_VEC,
        size,
    }
}
/// Returns the [`CodePointInversionList`] that contains exactly the Basic Multilingual Plane.
///
/// The set covers `0x0 -> 0xFFFF` inclusive.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
///
/// const BMP_MAX: u32 = 0xFFFF;
///
/// let expected = [0x0, BMP_MAX + 1];
/// assert_eq!(
///     CodePointInversionList::bmp().get_inversion_list_vec(),
///     expected
/// );
/// assert_eq!(
///     CodePointInversionList::bmp().size(),
///     (expected[1] - expected[0]) as usize
/// );
/// ```
pub fn bmp() -> Self {
    // One element per code point in `0..=BMP_MAX`.
    let size = BMP_MAX + 1;
    Self {
        inv_list: BMP_INV_LIST_VEC,
        size,
    }
}
/// Returns the inversion list as a slice
///
/// Public only to the crate, not exposed to public
pub(crate) fn as_inversion_list(&self) -> &ZeroVec<u32> {
&self.inv_list
}
/// Yields an [`Iterator`] over every [`char`] in the [`CodePointInversionList`], in ascending order.
///
/// Code points in the set that are not valid `char`s (surrogates) are skipped.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x44, 0x45, 0x46];
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// let mut ex_iter_chars = example.iter_chars();
/// assert_eq!(Some('A'), ex_iter_chars.next());
/// assert_eq!(Some('B'), ex_iter_chars.next());
/// assert_eq!(Some('C'), ex_iter_chars.next());
/// assert_eq!(Some('E'), ex_iter_chars.next());
/// assert_eq!(None, ex_iter_chars.next());
/// ```
pub fn iter_chars(&self) -> impl Iterator<Item = char> + '_ {
    #[allow(clippy::indexing_slicing)] // chunks
    self.inv_list.as_ule_slice().chunks(2).flat_map(|pair| {
        let start: u32 = AsULE::from_unaligned(pair[0]);
        let limit: u32 = AsULE::from_unaligned(pair[1]);
        // `char::from_u32` drops the code points that have no `char` representation.
        (start..limit).filter_map(char::from_u32)
    })
}
/// Yields an [`Iterator`] returning the ranges of the code points that are
/// included in the [`CodePointInversionList`]
///
/// Ranges are returned as [`RangeInclusive`], which is inclusive of its
/// `end` bound value. An end-inclusive behavior matches the ICU4C/J
/// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
///
/// # Example
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x44, 0x45, 0x46];
/// let example =
/// CodePointInversionList::try_from_inversion_list_slice(&example_list)
/// .unwrap();
/// let mut example_iter_ranges = example.iter_ranges();
/// assert_eq!(Some(0x41..=0x43), example_iter_ranges.next());
/// assert_eq!(Some(0x45..=0x45), example_iter_ranges.next());
/// assert_eq!(None, example_iter_ranges.next());
/// ```
pub fn iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_ {
#[allow(clippy::indexing_slicing)] // chunks
self.inv_list.as_ule_slice().chunks(2).map(|pair| {
let range_start: u32 = AsULE::from_unaligned(pair[0]);
let range_limit: u32 = AsULE::from_unaligned(pair[1]);
RangeInclusive::new(range_start, range_limit - 1)
})
}
/// Yields an [`Iterator`] returning the ranges of the code points that are
/// *not* included in the [`CodePointInversionList`]
///
/// Ranges are returned as [`RangeInclusive`], which is inclusive of its
/// `end` bound value. An end-inclusive behavior matches the ICU4C/J
/// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
///
/// The complement is taken relative to `0x0 -> 0x10FFFF` inclusive: an empty set
/// yields that single full range, and a set covering everything yields nothing.
///
/// # Example
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x44, 0x45, 0x46];
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// let mut example_iter_ranges = example.iter_ranges_complemented();
/// assert_eq!(Some(0..=0x40), example_iter_ranges.next());
/// assert_eq!(Some(0x44..=0x44), example_iter_ranges.next());
/// assert_eq!(Some(0x46..=char::MAX as u32), example_iter_ranges.next());
/// assert_eq!(None, example_iter_ranges.next());
/// ```
pub fn iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
    let inv_ule = self.inv_list.as_ule_slice();
    // Dropping the first start and the last limit leaves `[limit_i, start_{i+1})` pairs,
    // i.e. the gaps between consecutive ranges. `saturating_sub` keeps the empty list
    // from underflowing; `get` then returns `None` for the inverted range `1..0`.
    let middle = inv_ule
        .get(1..inv_ule.len().saturating_sub(1))
        .unwrap_or(&[]);
    // Everything below the first range; for an empty set that is the whole code point space.
    let beginning = match self.inv_list.first() {
        None => Some(0..=char::MAX as u32),
        Some(0) => None,
        Some(first) => Some(0..=first - 1),
    };
    // Everything from the last (exclusive) limit upward. The limit is itself the first
    // excluded code point, so a tail exists exactly when it is still a code point
    // (`<= char::MAX`); a limit of `char::MAX + 1` means the set reaches the very end.
    let end = match self.inv_list.last() {
        Some(last) if last <= char::MAX as u32 => Some(last..=char::MAX as u32),
        _ => None,
    };
    #[allow(clippy::indexing_slicing)] // chunks
    let chunks = middle.chunks(2).map(|pair| {
        let range_start: u32 = AsULE::from_unaligned(pair[0]);
        let range_limit: u32 = AsULE::from_unaligned(pair[1]);
        RangeInclusive::new(range_start, range_limit - 1)
    });
    beginning.into_iter().chain(chunks).chain(end)
}
/// Returns how many contiguous ranges make up this [`CodePointInversionList`].
pub fn get_range_count(&self) -> usize {
    // Each range occupies two entries (start and exclusive limit) of the list.
    let entry_count = self.inv_list.len();
    entry_count / 2
}
/// Returns a specific range contained in this [`CodePointInversionList`] by index.
/// Intended for use in FFI.
///
/// Returns `None` when `idx` is not the index of a range, including indices so
/// large that locating the range's entries would overflow `usize`.
pub fn get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>> {
    // `idx` may come from an untrusted FFI caller, so the index arithmetic is checked
    // rather than allowed to overflow.
    let start_idx = idx.checked_mul(2)?;
    let end_idx = start_idx.checked_add(1)?;
    let start = self.inv_list.get(start_idx)?;
    let end = self.inv_list.get(end_idx)?;
    // The stored limit is exclusive; the returned range is inclusive.
    Some(RangeInclusive::new(start, end - 1))
}
/// Returns how many code points the [`CodePointInversionList`] contains.
pub fn size(&self) -> usize {
    if self.is_empty() {
        0
    } else {
        self.size as usize
    }
}
/// Returns whether or not the [`CodePointInversionList`] is empty
pub fn is_empty(&self) -> bool {
self.inv_list.is_empty()
}
/// Shared lookup behind the `contains*` methods.
///
/// Returns `Some(i)` when `query` is in the set, where `i` is the index (in the
/// inversion list) of the start entry of the range holding `query`; returns `None`
/// when `query` is not in the set.
fn contains_query(&self, query: u32) -> Option<usize> {
    let entry_count = self.inv_list.len();
    match self.inv_list.binary_search(&query) {
        // Exact hit on an even index: `query` is the start of a range.
        Ok(pos) if pos % 2 == 0 => Some(pos),
        // Insertion point at an odd, in-bounds index: `query` lies strictly between
        // a range's start (at `pos - 1`) and its exclusive limit (at `pos`).
        Err(pos) if pos % 2 == 1 && pos < entry_count => Some(pos - 1),
        // Exact hit on a limit, or a position between/after ranges: not a member.
        _ => None,
    }
}
/// Reports whether `query` is a member of the [`CodePointInversionList`].
///
/// Performs one binary search over the start and end points of the set,
/// i.e. `O(log(n))` where `n` is the number of those points.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x43, 0x44, 0x45];
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// assert!(example.contains('A'));
/// assert!(!example.contains('C'));
/// ```
pub fn contains(&self, query: char) -> bool {
    let code_point = u32::from(query);
    self.contains_query(code_point).is_some()
}
/// Reports whether the code point given as a [`u32`] is a member of the [`CodePointInversionList`].
///
/// Note: Even though [`u32`] and [`prim@char`] in Rust are non-negative 4-byte
/// values, there is an important difference. A [`u32`] can take values up to
/// a very large integer value, while a [`prim@char`] in Rust is defined to be in
/// the range from 0 to the maximum valid Unicode Scalar Value.
///
/// Performs one binary search over the start and end points of the set,
/// i.e. `O(log(n))` where `n` is the number of those points.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x43, 0x44, 0x45];
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// assert!(example.contains32(0x41));
/// assert!(!example.contains32(0x43));
/// ```
pub fn contains32(&self, query: u32) -> bool {
    let containing_range = self.contains_query(query);
    containing_range.is_some()
}
/// Reports whether every code point of `range` is a member of the [`CodePointInversionList`].
///
/// Only the `start` of the range is located with a binary search (`O(log(n))`, where
/// `n` is the number of start and end points in the set); the `end` is then compared
/// against the limit of the range that was found, in a single `O(1)` step.
/// An empty or inverted `range` is reported as not contained.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x43, 0x44, 0x45];
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// assert!(example.contains_range(&('A'..'C')));
/// assert!(example.contains_range(&('A'..='B')));
/// assert!(!example.contains_range(&('A'..='C')));
/// ```
///
/// Surrogate points (`0xD800 -> 0xDFFF`) will return [`false`] if the Range contains them but the
/// [`CodePointInversionList`] does not.
///
/// Note: when comparing to ICU4C/J, keep in mind that `Range`s in Rust are
/// constructed inclusive of start boundary and exclusive of end boundary.
/// The ICU4C/J `CodePointInversionList::contains(UChar32 start, UChar32 end)` method
/// differs by including the end boundary.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// use std::char;
/// let check =
///     char::from_u32(0xD7FE).unwrap()..char::from_u32(0xE001).unwrap();
/// let example_list = [0xD7FE, 0xD7FF, 0xE000, 0xE001];
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// assert!(!example.contains_range(&(check)));
/// ```
pub fn contains_range(&self, range: &impl RangeBounds<char>) -> bool {
    let (from, till) = deconstruct_range(range);
    if from >= till {
        return false;
    }
    // Find the range holding `from`, then fetch that range's exclusive limit.
    let limit_lookup = self
        .contains_query(from)
        .map(|pos| self.inv_list.get(pos + 1));
    match limit_lookup {
        None => false,
        Some(Some(limit)) => till <= limit,
        Some(None) => {
            debug_assert!(
                false,
                "Inversion list query should not return out of bounds index"
            );
            false
        }
    }
}
/// Check if the calling [`CodePointInversionList`] contains all the characters of the given [`CodePointInversionList`]
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x46, 0x55, 0x5B]; // A - E, U - Z
/// let example =
/// CodePointInversionList::try_from_inversion_list_slice(&example_list)
/// .unwrap();
/// let a_to_d =
/// CodePointInversionList::try_from_inversion_list_slice(&[0x41, 0x45])
/// .unwrap();
/// let f_to_t =
/// CodePointInversionList::try_from_inversion_list_slice(&[0x46, 0x55])
/// .unwrap();
/// let r_to_x =
/// CodePointInversionList::try_from_inversion_list_slice(&[0x52, 0x58])
/// .unwrap();
/// assert!(example.contains_set(&a_to_d)); // contains all
/// assert!(!example.contains_set(&f_to_t)); // contains none
/// assert!(!example.contains_set(&r_to_x)); // contains some
/// ```
pub fn contains_set(&self, set: &Self) -> bool {
if set.size() > self.size() {
return false;
}
let mut set_ranges = set.iter_ranges();
let mut check_elem = set_ranges.next();
let ranges = self.iter_ranges();
for range in ranges {
match check_elem {
Some(ref check_range) => {
if check_range.start() >= range.start()
&& check_range.end() <= &(range.end() + 1)
{
check_elem = set_ranges.next();
}
}
_ => break,
}
}
check_elem.is_none()
}
/// Measures the leading run of `span_str` whose characters are all contained in the set
/// (`contained == true`) or all not contained (`contained == false`).
///
/// The result is the number of [`char`]s in that run.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x44]; // {A, B, C}
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// assert_eq!(example.span("CABXYZ", true), 3);
/// assert_eq!(example.span("XYZC", false), 3);
/// assert_eq!(example.span("XYZ", true), 0);
/// assert_eq!(example.span("ABC", false), 0);
/// ```
pub fn span(&self, span_str: &str, contained: bool) -> usize {
    let in_run = |c: &char| self.contains(*c) == contained;
    span_str.chars().take_while(in_run).count()
}
/// Returns the start of the trailing substring (starting from end of string) where the characters are
/// either contained/not contained in the set. Returns the length of the string if no valid return.
///
/// Positions and lengths are measured in [`char`]s, the same unit
/// [`CodePointInversionList::span`] uses, so the result is meaningful for
/// non-ASCII text as well.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionList;
/// let example_list = [0x41, 0x44]; // {A, B, C}
/// let example =
///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
///         .unwrap();
/// assert_eq!(example.span_back("XYZCAB", true), 3);
/// assert_eq!(example.span_back("ABCXYZ", true), 6);
/// assert_eq!(example.span_back("CABXYZ", false), 3);
/// ```
pub fn span_back(&self, span_str: &str, contained: bool) -> usize {
    // Count both quantities in `char`s: subtracting a `char` count from the byte
    // length (`str::len`) gives a wrong position once multi-byte characters appear.
    let total_chars = span_str.chars().count();
    let trailing_chars = span_str
        .chars()
        .rev()
        .take_while(|&x| self.contains(x) == contained)
        .count();
    total_chars - trailing_chars
}
}
// Unit tests for `CodePointInversionList`: construction, membership queries,
// iteration, span helpers, and serde / databake round trips.
#[cfg(test)]
mod tests {
use super::{CodePointInversionList, CodePointInversionListError};
use std::{char, vec::Vec};
use zerovec::ZeroVec;
// Construction
#[test]
fn test_codepointinversionlist_try_from_vec() {
let ex = vec![0x2, 0x3, 0x4, 0x5];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(ex, check.get_inversion_list_vec());
assert_eq!(2, check.size());
}
#[test]
fn test_codepointinversionlist_try_from_vec_error() {
// Odd number of entries: rejected, and the error carries the input back.
let check = vec![0x1, 0x1, 0x2, 0x3, 0x4];
let inv_list = ZeroVec::from_slice_or_alloc(&check);
let set = CodePointInversionList::try_from_inversion_list(inv_list);
assert!(matches!(
set,
Err(CodePointInversionListError::InvalidSet(_))
));
if let Err(CodePointInversionListError::InvalidSet(actual)) = set {
assert_eq!(&check, &actual);
}
}
// CodePointInversionList membership functions
#[test]
fn test_codepointinversionlist_contains_query() {
let ex = vec![0x41, 0x46, 0x4B, 0x55];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert!(check.contains_query(0x40).is_none());
assert_eq!(check.contains_query(0x41).unwrap(), 0);
assert_eq!(check.contains_query(0x44).unwrap(), 0);
// 0x46 is the exclusive limit of the first range, so it is not a member.
assert!(check.contains_query(0x46).is_none());
assert_eq!(check.contains_query(0x4C).unwrap(), 2);
assert!(check.contains_query(0x56).is_none());
}
#[test]
fn test_codepointinversionlist_contains() {
let ex = vec![0x2, 0x5, 0xA, 0xF];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert!(check.contains(0x2 as char));
assert!(check.contains(0x4 as char));
assert!(check.contains(0xA as char));
assert!(check.contains(0xE as char));
}
#[test]
fn test_codepointinversionlist_contains_false() {
let ex = vec![0x2, 0x5, 0xA, 0xF];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert!(!check.contains(0x1 as char));
assert!(!check.contains(0x5 as char));
assert!(!check.contains(0x9 as char));
assert!(!check.contains(0xF as char));
assert!(!check.contains(0x10 as char));
}
#[test]
fn test_codepointinversionlist_contains_range() {
let ex = vec![0x41, 0x46, 0x4B, 0x55];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert!(check.contains_range(&('A'..='E'))); // 65 - 69
assert!(check.contains_range(&('C'..'D'))); // 67 - 67
assert!(check.contains_range(&('L'..'P'))); // 76 - 80
assert!(!check.contains_range(&('L'..='U'))); // 76 - 85
}
#[test]
fn test_codepointinversionlist_contains_range_false() {
let ex = vec![0x41, 0x46, 0x4B, 0x55];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert!(!check.contains_range(&('!'..'A'))); // 33 - 65
assert!(!check.contains_range(&('F'..'K'))); // 70 - 74
assert!(!check.contains_range(&('U'..))); // 85 - ..
}
#[test]
fn test_codepointinversionlist_contains_range_invalid() {
// Inverted and empty ranges are never contained, even in the full set.
let check = CodePointInversionList::all();
assert!(!check.contains_range(&('A'..'!'))); // 65 - 33
assert!(!check.contains_range(&('A'..'A'))); // 65 - 65
}
#[test]
fn test_codepointinversionlist_contains_set_u() {
let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x6E];
let u = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
let inside = vec![0xF, 0x14, 0x2C, 0x31, 0x46, 0x50, 0x64, 0x6D];
let s = CodePointInversionList::try_from_inversion_list_slice(&inside).unwrap();
assert!(u.contains_set(&s));
}
#[test]
fn test_codepointinversionlist_contains_set_u_false() {
let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x78];
let u = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
let outside = vec![0x0, 0xA, 0x16, 0x2C, 0x32, 0x46, 0x4F, 0x51, 0x6D, 0x6F];
let s = CodePointInversionList::try_from_inversion_list_slice(&outside).unwrap();
assert!(!u.contains_set(&s));
}
// Size and emptiness
#[test]
fn test_codepointinversionlist_size() {
let ex = vec![0x2, 0x5, 0xA, 0xF];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(8, check.size());
let check = CodePointInversionList::all();
let expected = (char::MAX as u32) + 1;
assert_eq!(expected as usize, check.size());
let inv_list_vec: Vec<u32> = vec![];
let check = CodePointInversionList {
inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
size: 0,
};
assert_eq!(check.size(), 0);
}
#[test]
fn test_codepointinversionlist_is_empty() {
let inv_list_vec: Vec<u32> = vec![];
let check = CodePointInversionList {
inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
size: 0,
};
assert!(check.is_empty());
}
#[test]
fn test_codepointinversionlist_is_not_empty() {
let check = CodePointInversionList::all();
assert!(!check.is_empty());
}
// Iteration
#[test]
fn test_codepointinversionlist_iter_chars() {
// The surrogate 0xD800 is in the set but is not a `char`, so it is not yielded.
let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
let mut iter = check.iter_chars();
assert_eq!(Some('A'), iter.next());
assert_eq!(Some('B'), iter.next());
assert_eq!(Some('C'), iter.next());
assert_eq!(Some('E'), iter.next());
assert_eq!(None, iter.next());
}
#[test]
fn test_codepointinversionlist_iter_ranges() {
let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
let mut ranges = set.iter_ranges();
assert_eq!(Some(0x41..=0x43), ranges.next());
assert_eq!(Some(0x45..=0x45), ranges.next());
assert_eq!(Some(0xD800..=0xD800), ranges.next());
assert_eq!(None, ranges.next());
}
#[test]
fn test_codepointinversionlist_iter_ranges_exactsizeiter_trait() {
let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
let ranges = set.iter_ranges();
assert_eq!(3, ranges.len());
}
#[test]
fn test_codepointinversionlist_range_count() {
let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(3, set.get_range_count());
}
#[test]
fn test_codepointinversionlist_get_nth_range() {
let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(Some(0x41..=0x43), set.get_nth_range(0));
assert_eq!(Some(0x45..=0x45), set.get_nth_range(1));
assert_eq!(Some(0xD800..=0xD800), set.get_nth_range(2));
assert_eq!(None, set.get_nth_range(3));
}
// Range<char> cannot represent the upper bound (non-inclusive) for
// char::MAX, whereas Range<u32> can.
#[test]
fn test_codepointinversionlist_iter_ranges_with_max_code_point() {
let ex = vec![0x80, (char::MAX as u32) + 1];
let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
let mut ranges = set.iter_ranges();
assert_eq!(Some(0x80..=(char::MAX as u32)), ranges.next());
assert_eq!(None, ranges.next());
}
// Span helpers
#[test]
fn test_codepointinversionlist_span_contains() {
let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(check.span("ABCDE", true), 3);
assert_eq!(check.span("E", true), 0);
}
#[test]
fn test_codepointinversionlist_span_does_not_contain() {
let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(check.span("DEF", false), 2);
assert_eq!(check.span("KLMA", false), 3);
}
#[test]
fn test_codepointinversionlist_span_back_contains() {
let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(check.span_back("XYZABFH", true), 3);
assert_eq!(check.span_back("ABCXYZ", true), 6);
}
#[test]
fn test_codepointinversionlist_span_back_does_not_contain() {
let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
assert_eq!(check.span_back("ABCXYZ", false), 3);
assert_eq!(check.span_back("XYZABC", false), 6);
}
// Round trips
#[test]
fn test_uniset_to_inv_list() {
let inv_list = [
0x9, 0xE, 0x20, 0x21, 0x85, 0x86, 0xA0, 0xA1, 0x1626, 0x1627, 0x2000, 0x2003, 0x2028,
0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
];
let s: CodePointInversionList =
CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
let round_trip_inv_list = s.get_inversion_list_vec();
assert_eq!(round_trip_inv_list, inv_list);
}
#[test]
fn test_serde_serialize() {
let inv_list = [0x41, 0x46, 0x4B, 0x55];
let uniset = CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
let json_str = serde_json::to_string(&uniset).unwrap();
assert_eq!(json_str, r#"["A-E","K-T"]"#);
}
#[test]
fn test_serde_serialize_surrogates() {
// Surrogates have no `char` form and are written as `U+XXXX` escapes.
let inv_list = [0xDFAB, 0xDFFF];
let uniset = CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
let json_str = serde_json::to_string(&uniset).unwrap();
assert_eq!(json_str, r#"["U+DFAB-U+DFFE"]"#);
}
#[test]
fn test_serde_deserialize() {
let inv_list_str = r#"["A-E","K-T"]"#;
let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
let exp_uniset =
CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
assert_eq!(act_uniset, exp_uniset);
}
#[test]
fn test_serde_deserialize_surrogates() {
let inv_list_str = r#"["U+DFAB-U+DFFE"]"#;
let exp_inv_list = [0xDFAB, 0xDFFF];
let exp_uniset =
CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
assert_eq!(act_uniset, exp_uniset);
}
#[test]
fn test_serde_deserialize_legacy() {
// Legacy human-readable form: the raw inversion list as numbers.
let inv_list_str = "[65,70,75,85]";
let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
let exp_uniset =
CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
assert_eq!(act_uniset, exp_uniset);
}
#[test]
fn test_serde_deserialize_invalid() {
assert!(serde_json::from_str::<CodePointInversionList>("[65,70,98775,85]").is_err());
// NOTE(review): the `U+FFFFFFFFFF` token below is unquoted, so this input is not
// valid JSON to begin with; the failure presumably comes from the JSON parser
// rather than from code point validation.
assert!(serde_json::from_str::<CodePointInversionList>("[65,70,U+FFFFFFFFFF,85]").is_err());
}
#[test]
fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
let set = CodePointInversionList::bmp();
let set_serialized: Vec<u8> = postcard::to_allocvec(&set).unwrap();
let set_deserialized: CodePointInversionList =
postcard::from_bytes::<CodePointInversionList>(&set_serialized)?;
assert_eq!(&set, &set_deserialized);
// The deserialized list must not own its data.
assert!(!set_deserialized.inv_list.is_owned());
Ok(())
}
#[test]
fn databake() {
// Read as little-endian u32s, the byte string below is
// [0x30, 0x3A, 0x41, 0x47, 0x61, 0x67], whose three ranges total 22 code points.
databake::test_bake!(
CodePointInversionList<'static>,
const: unsafe {
#[allow(unused_unsafe)]
crate::codepointinvlist::CodePointInversionList::from_parts_unchecked(
unsafe {
zerovec::ZeroVec::from_bytes_unchecked(
b"0\0\0\0:\0\0\0A\0\0\0G\0\0\0a\0\0\0g\0\0\0"
)
},
22u32,
)
},
icu_collections,
[zerovec],
);
}
}