Revision control
Copy as Markdown
Other Tools
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
use icu_segmenter::GraphemeClusterSegmenter;
use icu_segmenter::LineSegmenter;
use icu_segmenter::SentenceSegmenter;
use icu_segmenter::WordSegmenter;
use std::char;
/// Iterator over the test cases of a break-test data file.
///
/// Wraps the file contents split on `'\n'` and yields one [`TestData`] for
/// every line that carries test tokens. Blank lines, whitespace-only lines and
/// comment lines (everything from the first `#` onwards is a comment) are
/// skipped.
struct TestContentIterator(std::str::Split<'static, char>);

/// One parsed test case: the sample text in three encodings plus the expected
/// break offsets for each of them.
struct TestData {
    /// The raw line the case was parsed from (used in failure messages).
    original_line: &'static str,
    /// The sample text as a sequence of code points.
    utf8_vec: Vec<char>,
    /// The sample text as UTF-16 code units.
    utf16_vec: Vec<u16>,
    /// The code points below U+0100 of the sample text, one byte each.
    /// Only a faithful encoding when `break_result_latin1` is `Some`.
    latin1_vec: Vec<u8>,
    /// Expected break offsets, counted in UTF-8 bytes.
    break_result_utf8: Vec<usize>,
    /// Expected break offsets, counted in UTF-16 code units.
    break_result_utf16: Vec<usize>,
    /// Expected break offsets, counted in Latin-1 bytes; `None` when the
    /// sample contains a code point at or above U+0100.
    break_result_latin1: Option<Vec<usize>>,
}

impl TestContentIterator {
    /// Creates an iterator over the test cases contained in `file`.
    pub fn new(file: &'static str) -> Self {
        Self(file.split('\n'))
    }
}

impl Iterator for TestContentIterator {
    type Item = TestData;

    /// Parses and returns the next test case, or `None` once every line of the
    /// file has been consumed.
    ///
    /// # Panics
    ///
    /// Panics if a code point token is not valid hexadecimal, is not a Unicode
    /// scalar value, or if a break marker is neither `÷` nor `×`.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // `?` ends the iteration only when the underlying split is
            // exhausted, so a blank line in the middle of the file no longer
            // hides the test cases that follow it.
            let line = self.0.next()?;

            // Everything from the first '#' onwards is a comment.
            let content = line.split('#').next().unwrap_or("");
            let mut tokens = content.split_ascii_whitespace().peekable();
            if tokens.peek().is_none() {
                // Blank, whitespace-only or comment-only line.
                continue;
            }

            let mut utf8_break: Vec<_> = Vec::new();
            let mut latin1_break: Vec<_> = Vec::new();
            let mut utf16_break: Vec<_> = Vec::new();
            let mut utf8_vec: Vec<_> = Vec::new();
            let mut latin1_vec: Vec<_> = Vec::new();
            let mut utf16_vec: Vec<_> = Vec::new();
            // Running length of the sample text in each encoding's own unit.
            let mut utf8_len = 0;
            let mut latin1_len = 0;
            let mut utf16_len = 0;
            let mut latin1_only = true;

            // Tokens alternate: break marker, code point, break marker, ...
            for (index, token) in tokens.enumerate() {
                if index % 2 == 1 {
                    let scalar = u32::from_str_radix(token, 16).unwrap_or_else(|e| {
                        panic!(
                            "invalid hex code point {:?} in test line {:?}: {}",
                            token, line, e
                        )
                    });
                    let ch = char::from_u32(scalar).unwrap_or_else(|| {
                        panic!(
                            "{:#X} is not a Unicode scalar value in test line {:?}",
                            scalar, line
                        )
                    });

                    utf8_vec.push(ch);
                    utf8_len += ch.len_utf8();

                    if (ch as u32) < 0x100 {
                        latin1_vec.push(ch as u8);
                        latin1_len += 1;
                    } else {
                        latin1_only = false;
                    }

                    let mut u16_buf = [0; 2];
                    let units = ch.encode_utf16(&mut u16_buf);
                    utf16_vec.extend_from_slice(units);
                    utf16_len += units.len();
                } else if token != "\u{00d7}" {
                    // Anything that is not "×" (no break) must be "÷" (break).
                    assert_eq!(
                        token, "\u{00f7}",
                        "unexpected break marker in test line {:?}",
                        line
                    );
                    utf8_break.push(utf8_len);
                    latin1_break.push(latin1_len);
                    utf16_break.push(utf16_len);
                }
            }

            return Some(TestData {
                original_line: line,
                utf8_vec,
                utf16_vec,
                latin1_vec,
                break_result_utf8: utf8_break,
                break_result_utf16: utf16_break,
                break_result_latin1: if latin1_only {
                    Some(latin1_break)
                } else {
                    None
                },
            });
        }
    }
}
/// Runs every test case in `file` through a `LineSegmenter` built with
/// `LineSegmenter::new_dictionary()` and checks the reported break offsets for
/// the UTF-8, UTF-16 and (when the sample allows it) Latin-1 entry points.
///
/// # Panics
///
/// Panics on the first test case whose actual breaks differ from the expected
/// ones. For a UTF-8 mismatch a per-code-point comparison table is printed
/// first.
fn line_break_test(file: &'static str) {
    let test_iter = TestContentIterator::new(file);
    let segmenter = LineSegmenter::new_dictionary();
    for (i, mut test) in test_iter.enumerate() {
        let s: String = test.utf8_vec.into_iter().collect();
        let iter = segmenter.segment_str(&s);
        let result: Vec<usize> = iter.collect();
        // NOTE: For consistency with ICU4C and other Segmenters, we return a breakpoint at
        // index 0, despite UAX #14 suggesting otherwise. See issue #3283.
        if test.break_result_utf8.first() != Some(&0) {
            test.break_result_utf8.insert(0, 0);
        }
        if result != test.break_result_utf8 {
            let lb = icu::properties::maps::line_break();
            let lb_name = icu::properties::LineBreak::enum_to_long_name_mapper();
            // Advanced in step with the actual breaks below; its items are not
            // read. NOTE(review): presumably kept so segmenter state could be
            // logged later, as in the sibling tests - confirm.
            let mut iter = segmenter.segment_str(&s);
            // TODO(egg): It would be really nice to have Name here.
            println!(" | A | E | Code pt. | Line_Break | Literal");
            for (i, c) in s.char_indices() {
                let expected_break = test.break_result_utf8.contains(&i);
                let actual_break = result.contains(&i);
                if actual_break {
                    iter.next();
                }
                println!(
                    "{}| {} | {} | {:>8} | {:>18} | {}",
                    if actual_break != expected_break {
                        // Marker for a mismatching row, written as an escape
                        // so the literal survives encoding round-trips.
                        "\u{1F62D}"
                    } else {
                        " "
                    },
                    if actual_break { "÷" } else { "×" },
                    if expected_break { "÷" } else { "×" },
                    format!("{:04X}", c as u32),
                    lb_name
                        .get(lb.get(c))
                        .unwrap_or(&format!("{:?}", lb.get(c))),
                    c
                )
            }
            println!("Test case #{}", i);
            // Report the failing line like the UTF-16 and Latin-1 checks do.
            panic!("UTF8: {}", test.original_line)
        }

        let iter = segmenter.segment_utf16(&test.utf16_vec);
        let result: Vec<usize> = iter.collect();
        if test.break_result_utf16.first() != Some(&0) {
            test.break_result_utf16.insert(0, 0);
        }
        assert_eq!(
            result, test.break_result_utf16,
            "UTF16: {}",
            test.original_line
        );

        // When the sample consists only of Latin-1 characters, the Latin-1
        // segmenter entry point can be tested as well.
        if let Some(mut break_result_latin1) = test.break_result_latin1 {
            let iter = segmenter.segment_latin1(&test.latin1_vec);
            if break_result_latin1.first() != Some(&0) {
                break_result_latin1.insert(0, 0);
            }
            let result: Vec<usize> = iter.collect();
            assert_eq!(
                result, break_result_latin1,
                "Latin1: {}",
                test.original_line
            );
        }
    }
}
/// Checks the line segmenter against `testdata/LineBreakTest.txt`.
#[test]
fn run_line_break_test() {
    let test_data: &'static str = include_str!("testdata/LineBreakTest.txt");
    line_break_test(test_data);
}
/// Checks the line segmenter against `testdata/LineBreakExtraTest.txt`.
#[test]
fn run_line_break_extra_test() {
    let test_data: &'static str = include_str!("testdata/LineBreakExtraTest.txt");
    line_break_test(test_data);
}
/// Runs every test case in `file` through a `WordSegmenter` built with
/// `WordSegmenter::new_dictionary()` and checks the reported break offsets for
/// the UTF-8, UTF-16 and (when the sample allows it) Latin-1 entry points.
///
/// # Panics
///
/// Panics on the first test case whose actual breaks differ from the expected
/// ones. For a UTF-8 mismatch a per-code-point comparison table is printed
/// first.
fn word_break_test(file: &'static str) {
    let test_iter = TestContentIterator::new(file);
    let segmenter = WordSegmenter::new_dictionary();
    for (i, test) in test_iter.enumerate() {
        let s: String = test.utf8_vec.into_iter().collect();
        let iter = segmenter.segment_str(&s);
        let result: Vec<usize> = iter.collect();
        if result != test.break_result_utf8 {
            let wb = icu::properties::maps::word_break();
            let wb_name = icu::properties::WordBreak::enum_to_long_name_mapper();
            // Advanced in step with the actual breaks below; its items are not
            // read (see the state placeholder column).
            let mut iter = segmenter.segment_str(&s);
            // TODO(egg): It would be really nice to have Name here.
            println!(" | A | E | Code pt. | Word_Break | State | Literal");
            for (i, c) in s.char_indices() {
                let expected_break = test.break_result_utf8.contains(&i);
                let actual_break = result.contains(&i);
                if actual_break {
                    iter.next();
                }
                println!(
                    "{}| {} | {} | {:>8} | {:>14} | {} | {}",
                    if actual_break != expected_break {
                        // Marker for a mismatching row, written as an escape
                        // so the literal survives encoding round-trips.
                        "\u{1F62D}"
                    } else {
                        " "
                    },
                    if actual_break { "÷" } else { "×" },
                    if expected_break { "÷" } else { "×" },
                    format!("{:04X}", c as u32),
                    wb_name
                        .get(wb.get(c))
                        .unwrap_or(&format!("{:?}", wb.get(c))),
                    // Placeholder for logging the state if exposed.
                    // Not "?????" to hide from clippy.
                    "?".repeat(5),
                    c
                )
            }
            println!("Test case #{}", i);
            // Report the failing line like the UTF-16 and Latin-1 checks do.
            panic!("UTF8: {}", test.original_line)
        }

        let iter = segmenter.segment_utf16(&test.utf16_vec);
        let result: Vec<usize> = iter.collect();
        assert_eq!(
            result, test.break_result_utf16,
            "UTF16: {}",
            test.original_line
        );

        // When the sample consists only of Latin-1 characters, the Latin-1
        // segmenter entry point can be tested as well.
        if let Some(break_result_latin1) = test.break_result_latin1 {
            let iter = segmenter.segment_latin1(&test.latin1_vec);
            let result: Vec<usize> = iter.collect();
            assert_eq!(
                result, break_result_latin1,
                "Latin1: {}",
                test.original_line
            );
        }
    }
}
/// Checks the word segmenter against `testdata/WordBreakTest.txt`.
#[test]
fn run_word_break_test() {
    let test_data: &'static str = include_str!("testdata/WordBreakTest.txt");
    word_break_test(test_data);
}
/// Checks the word segmenter against `testdata/WordBreakExtraTest.txt`.
#[test]
fn run_word_break_extra_test() {
    let test_data: &'static str = include_str!("testdata/WordBreakExtraTest.txt");
    word_break_test(test_data);
}
/// Runs every test case in `file` through a `GraphemeClusterSegmenter` built
/// with `GraphemeClusterSegmenter::new()` and checks the reported break
/// offsets for the UTF-8, UTF-16 and (when the sample allows it) Latin-1 entry
/// points.
///
/// # Panics
///
/// Panics on the first test case whose actual breaks differ from the expected
/// ones. For a UTF-8 mismatch a per-code-point comparison table is printed
/// first.
fn grapheme_break_test(file: &'static str) {
    let test_iter = TestContentIterator::new(file);
    let segmenter = GraphemeClusterSegmenter::new();
    for (i, test) in test_iter.enumerate() {
        let s: String = test.utf8_vec.into_iter().collect();
        let iter = segmenter.segment_str(&s);
        let result: Vec<usize> = iter.collect();
        if result != test.break_result_utf8 {
            let gcb = icu::properties::maps::grapheme_cluster_break();
            let gcb_name = icu::properties::GraphemeClusterBreak::enum_to_long_name_mapper();
            // Advanced in step with the actual breaks below; its items are not
            // read (see the state placeholder column).
            let mut iter = segmenter.segment_str(&s);
            // TODO(egg): It would be really nice to have Name here.
            println!(" | A | E | Code pt. | GCB | State | Literal");
            for (i, c) in s.char_indices() {
                let expected_break = test.break_result_utf8.contains(&i);
                let actual_break = result.contains(&i);
                if actual_break {
                    iter.next();
                }
                println!(
                    "{}| {} | {} | {:>8} | {:>14} | {} | {}",
                    if actual_break != expected_break {
                        // Marker for a mismatching row, written as an escape
                        // so the literal survives encoding round-trips.
                        "\u{1F62D}"
                    } else {
                        " "
                    },
                    if actual_break { "÷" } else { "×" },
                    if expected_break { "÷" } else { "×" },
                    format!("{:04X}", c as u32),
                    gcb_name
                        .get(gcb.get(c))
                        .unwrap_or(&format!("{:?}", gcb.get(c))),
                    // Placeholder for logging the state if exposed.
                    // Not "?????" to hide from clippy.
                    "?".repeat(5),
                    c
                )
            }
            println!("Test case #{}", i);
            // Report the failing line like the UTF-16 and Latin-1 checks do.
            panic!("UTF8: {}", test.original_line)
        }

        let iter = segmenter.segment_utf16(&test.utf16_vec);
        let result: Vec<usize> = iter.collect();
        assert_eq!(
            result, test.break_result_utf16,
            "UTF16: {}",
            test.original_line
        );

        // When the sample consists only of Latin-1 characters, the Latin-1
        // segmenter entry point can be tested as well.
        if let Some(break_result_latin1) = test.break_result_latin1 {
            let iter = segmenter.segment_latin1(&test.latin1_vec);
            let result: Vec<usize> = iter.collect();
            assert_eq!(
                result, break_result_latin1,
                "Latin1: {}",
                test.original_line
            );
        }
    }
}
/// Checks the grapheme cluster segmenter against `testdata/GraphemeBreakTest.txt`.
#[test]
fn run_grapheme_break_test() {
    let test_data: &'static str = include_str!("testdata/GraphemeBreakTest.txt");
    grapheme_break_test(test_data);
}
/// Checks the grapheme cluster segmenter against `testdata/GraphemeBreakExtraTest.txt`.
#[test]
fn run_grapheme_break_extra_test() {
    let test_data: &'static str = include_str!("testdata/GraphemeBreakExtraTest.txt");
    grapheme_break_test(test_data);
}
/// Runs every test case in `file` through a `SentenceSegmenter` built with
/// `SentenceSegmenter::new()` and checks the reported break offsets for the
/// UTF-8, UTF-16 and (when the sample allows it) Latin-1 entry points.
///
/// # Panics
///
/// Panics on the first test case whose actual breaks differ from the expected
/// ones. For a UTF-8 mismatch a per-code-point comparison table is printed
/// first.
fn sentence_break_test(file: &'static str) {
    let test_iter = TestContentIterator::new(file);
    let segmenter = SentenceSegmenter::new();
    for (i, test) in test_iter.enumerate() {
        let s: String = test.utf8_vec.into_iter().collect();
        let iter = segmenter.segment_str(&s);
        let result: Vec<usize> = iter.collect();
        if result != test.break_result_utf8 {
            let sb = icu::properties::maps::sentence_break();
            let sb_name = icu::properties::SentenceBreak::enum_to_long_name_mapper();
            // Advanced in step with the actual breaks below; its items are not
            // read (see the state placeholder column).
            let mut iter = segmenter.segment_str(&s);
            // TODO(egg): It would be really nice to have Name here.
            println!(" | A | E | Code pt. | Sentence_Break | State | Literal");
            for (i, c) in s.char_indices() {
                let expected_break = test.break_result_utf8.contains(&i);
                let actual_break = result.contains(&i);
                if actual_break {
                    iter.next();
                }
                println!(
                    "{}| {} | {} | {:>8} | {:>14} | {} | {}",
                    if actual_break != expected_break {
                        // Marker for a mismatching row, written as an escape
                        // so the literal survives encoding round-trips.
                        "\u{1F62D}"
                    } else {
                        " "
                    },
                    if actual_break { "÷" } else { "×" },
                    if expected_break { "÷" } else { "×" },
                    format!("{:04X}", c as u32),
                    sb_name
                        .get(sb.get(c))
                        .unwrap_or(&format!("{:?}", sb.get(c))),
                    // Placeholder for logging the state if exposed.
                    // Not "?????" to hide from clippy.
                    "?".repeat(5),
                    c
                )
            }
            println!("Test case #{}", i);
            // Report the failing line like the UTF-16 and Latin-1 checks do.
            panic!("UTF8: {}", test.original_line)
        }

        let iter = segmenter.segment_utf16(&test.utf16_vec);
        let result: Vec<usize> = iter.collect();
        assert_eq!(
            result, test.break_result_utf16,
            "UTF16: {}",
            test.original_line
        );

        // When the sample consists only of Latin-1 characters, the Latin-1
        // segmenter entry point can be tested as well.
        if let Some(break_result_latin1) = test.break_result_latin1 {
            let iter = segmenter.segment_latin1(&test.latin1_vec);
            let result: Vec<usize> = iter.collect();
            assert_eq!(
                result, break_result_latin1,
                "Latin1: {}",
                test.original_line
            );
        }
    }
}
/// Checks the sentence segmenter against `testdata/SentenceBreakTest.txt`.
#[test]
fn run_sentence_break_test() {
    let test_data: &'static str = include_str!("testdata/SentenceBreakTest.txt");
    sentence_break_test(test_data);
}
/// Checks the sentence segmenter against `testdata/SentenceBreakExtraTest.txt`.
#[test]
fn run_sentence_break_extra_test() {
    let test_data: &'static str = include_str!("testdata/SentenceBreakExtraTest.txt");
    sentence_break_test(test_data);
}
/// Checks the sentence segmenter against `testdata/SentenceBreakRandomTest.txt`.
#[test]
fn run_sentence_break_random_test() {
    let test_data: &'static str = include_str!("testdata/SentenceBreakRandomTest.txt");
    sentence_break_test(test_data);
}