mod.rs - mozsearch

comm-central/third_party/rust/icu_segmenter/src/complex/lstm/mod.rs

Enable keyboard shortcuts

Revision control

Copy as Markdown

Other Tools

HG Web

// This file is part of ICU4X. For terms of use, please see the file

// called LICENSE at the top level of the ICU4X source tree

// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::grapheme::GraphemeClusterSegmenter;

use crate::provider::*;

use alloc::vec::Vec;

use core::char::{decode_utf16, REPLACEMENT_CHARACTER};

use zerovec::{maps::ZeroMapBorrowed, ule::UnvalidatedStr};

mod matrix;

use matrix::*;

// A word break iterator using LSTM model. Input string have to be same language.

struct LstmSegmenterIterator<'s> {

    input: &'s str,

    pos_utf8: usize,

    bies: BiesIterator<'s>,

impl Iterator for LstmSegmenterIterator<'_> {

    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {

        #[allow(clippy::indexing_slicing)] // pos_utf8 in range

        loop {

            let is_e = self.bies.next()?;

            self.pos_utf8 += self.input[self.pos_utf8..].chars().next()?.len_utf8();

            if is_e || self.bies.len() == 0 {

                return Some(self.pos_utf8);

struct LstmSegmenterIteratorUtf16<'s> {

    bies: BiesIterator<'s>,

    pos: usize,

impl Iterator for LstmSegmenterIteratorUtf16<'_> {

    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {

        loop {

            self.pos += 1;

            if self.bies.next()? || self.bies.len() == 0 {

                return Some(self.pos);

pub(super) struct LstmSegmenter<'l> {

    dic: ZeroMapBorrowed<'l, UnvalidatedStr, u16>,

    embedding: MatrixZero<'l, 2>,

    fw_w: MatrixZero<'l, 3>,

    fw_u: MatrixZero<'l, 3>,

    fw_b: MatrixZero<'l, 2>,

    bw_w: MatrixZero<'l, 3>,

    bw_u: MatrixZero<'l, 3>,

    bw_b: MatrixZero<'l, 2>,

    timew_fw: MatrixZero<'l, 2>,

    timew_bw: MatrixZero<'l, 2>,

    time_b: MatrixZero<'l, 1>,

    grapheme: Option<&'l RuleBreakDataV1<'l>>,

impl<'l> LstmSegmenter<'l> {

    /// Returns `Err` if grapheme data is required but not present

    pub(super) fn new(lstm: &'l LstmDataV1<'l>, grapheme: &'l RuleBreakDataV1<'l>) -> Self {

        let LstmDataV1::Float32(lstm) = lstm;

        let time_w = MatrixZero::from(&lstm.time_w);

        #[allow(clippy::unwrap_used)] // shape (2, 4, hunits)

        let timew_fw = time_w.submatrix(0).unwrap();

        #[allow(clippy::unwrap_used)] // shape (2, 4, hunits)

        let timew_bw = time_w.submatrix(1).unwrap();

        Self {

            dic: lstm.dic.as_borrowed(),

            embedding: MatrixZero::from(&lstm.embedding),

            fw_w: MatrixZero::from(&lstm.fw_w),

            fw_u: MatrixZero::from(&lstm.fw_u),

            fw_b: MatrixZero::from(&lstm.fw_b),

            bw_w: MatrixZero::from(&lstm.bw_w),

            bw_u: MatrixZero::from(&lstm.bw_u),

            bw_b: MatrixZero::from(&lstm.bw_b),

            timew_fw,

            timew_bw,

            time_b: MatrixZero::from(&lstm.time_b),

            grapheme: (lstm.model == ModelType::GraphemeClusters).then_some(grapheme),

    /// Create an LSTM based break iterator for an `str` (a UTF-8 string).

    pub(super) fn segment_str(&'l self, input: &'l str) -> impl Iterator<Item = usize> + 'l {

        self.segment_str_p(input)

    // For unit testing as we cannot inspect the opaque type's bies

    fn segment_str_p(&'l self, input: &'l str) -> LstmSegmenterIterator<'l> {

        let input_seq = if let Some(grapheme) = self.grapheme {

            GraphemeClusterSegmenter::new_and_segment_str(input, grapheme)

                .collect::<Vec<usize>>()

                .windows(2)

                .map(|chunk| {

                    let range = if let [first, second, ..] = chunk {

                        *first..*second

                    } else {

                        unreachable!()

};

                    let grapheme_cluster = if let Some(grapheme_cluster) = input.get(range) {

                        grapheme_cluster

                    } else {

                        return self.dic.len() as u16;

};

                    self.dic

                        .get_copied(UnvalidatedStr::from_str(grapheme_cluster))

                        .unwrap_or_else(|| self.dic.len() as u16)

})

                .collect()

        } else {

            input

                .chars()

                .map(|c| {

                    self.dic

                        .get_copied(UnvalidatedStr::from_str(c.encode_utf8(&mut [0; 4])))

                        .unwrap_or_else(|| self.dic.len() as u16)

})

                .collect()

};

        LstmSegmenterIterator {

            input,

            pos_utf8: 0,

            bies: BiesIterator::new(self, input_seq),

    /// Create an LSTM based break iterator for a UTF-16 string.

    pub(super) fn segment_utf16(&'l self, input: &[u16]) -> impl Iterator<Item = usize> + 'l {

        let input_seq = if let Some(grapheme) = self.grapheme {

            GraphemeClusterSegmenter::new_and_segment_utf16(input, grapheme)

                .collect::<Vec<usize>>()

                .windows(2)

                .map(|chunk| {

                    let range = if let [first, second, ..] = chunk {

                        *first..*second

                    } else {

                        unreachable!()

};

                    let grapheme_cluster = if let Some(grapheme_cluster) = input.get(range) {

                        grapheme_cluster

                    } else {

                        return self.dic.len() as u16;

};

                    self.dic

                        .get_copied_by(|key| {

                            key.as_bytes().iter().copied().cmp(

                                decode_utf16(grapheme_cluster.iter().copied()).flat_map(|c| {

                                    let mut buf = [0; 4];

                                    let len = c

                                        .unwrap_or(REPLACEMENT_CHARACTER)

                                        .encode_utf8(&mut buf)

                                        .len();

                                    buf.into_iter().take(len)

}),

})

                        .unwrap_or_else(|| self.dic.len() as u16)

})

                .collect()

        } else {

            decode_utf16(input.iter().copied())

                .map(|c| c.unwrap_or(REPLACEMENT_CHARACTER))

                .map(|c| {

                    self.dic

                        .get_copied(UnvalidatedStr::from_str(c.encode_utf8(&mut [0; 4])))

                        .unwrap_or_else(|| self.dic.len() as u16)

})

                .collect()

};

        LstmSegmenterIteratorUtf16 {

            bies: BiesIterator::new(self, input_seq),

            pos: 0,

struct BiesIterator<'l> {

    segmenter: &'l LstmSegmenter<'l>,

    input_seq: core::iter::Enumerate<alloc::vec::IntoIter<u16>>,

    h_bw: MatrixOwned<2>,

    curr_fw: MatrixOwned<1>,

    c_fw: MatrixOwned<1>,

impl<'l> BiesIterator<'l> {

    // input_seq is a sequence of id numbers that represents grapheme clusters or code points in the input line. These ids are used later

    // in the embedding layer of the model.

    fn new(segmenter: &'l LstmSegmenter<'l>, input_seq: Vec<u16>) -> Self {

        let hunits = segmenter.fw_u.dim().1;

        // Backward LSTM

        let mut c_bw = MatrixOwned::<1>::new_zero([hunits]);

        let mut h_bw = MatrixOwned::<2>::new_zero([input_seq.len(), hunits]);

        for (i, &g_id) in input_seq.iter().enumerate().rev() {

            if i + 1 < input_seq.len() {

                h_bw.as_mut().copy_submatrix::<1>(i + 1, i);

            #[allow(clippy::unwrap_used)]

            compute_hc(

                segmenter.embedding.submatrix::<1>(g_id as usize).unwrap(), /* shape (dict.len() + 1, hunit), g_id is at most dict.len() */

                h_bw.submatrix_mut(i).unwrap(), // shape (input_seq.len(), hunits)

                c_bw.as_mut(),

                segmenter.bw_w,

                segmenter.bw_u,

                segmenter.bw_b,

);

        Self {

            input_seq: input_seq.into_iter().enumerate(),

            h_bw,

            c_fw: MatrixOwned::<1>::new_zero([hunits]),

            curr_fw: MatrixOwned::<1>::new_zero([hunits]),

            segmenter,

impl ExactSizeIterator for BiesIterator<'_> {

    fn len(&self) -> usize {

        self.input_seq.len()

impl Iterator for BiesIterator<'_> {

    type Item = bool;

    fn next(&mut self) -> Option<Self::Item> {

        let (i, g_id) = self.input_seq.next()?;

        #[allow(clippy::unwrap_used)]

        compute_hc(

            self.segmenter

                .embedding

                .submatrix::<1>(g_id as usize)

                .unwrap(), // shape (dict.len() + 1, hunit), g_id is at most dict.len()

            self.curr_fw.as_mut(),

            self.c_fw.as_mut(),

            self.segmenter.fw_w,

            self.segmenter.fw_u,

            self.segmenter.fw_b,

);

        #[allow(clippy::unwrap_used)] // shape (input_seq.len(), hunits)

        let curr_bw = self.h_bw.submatrix::<1>(i).unwrap();

        let mut weights = [0.0; 4];

        let mut curr_est = MatrixBorrowedMut {

            data: &mut weights,

            dims: [4],

};

        curr_est.add_dot_2d(self.curr_fw.as_borrowed(), self.segmenter.timew_fw);

        curr_est.add_dot_2d(curr_bw, self.segmenter.timew_bw);

        #[allow(clippy::unwrap_used)] // both shape (4)

        curr_est.add(self.segmenter.time_b).unwrap();

        // For correct BIES weight calculation we'd now have to apply softmax, however

        // we're only doing a naive argmax, so a monotonic function doesn't make a difference.

        Some(weights[2] > weights[0] && weights[2] > weights[1] && weights[2] > weights[3])

/// `compute_hc1` implemens the evaluation of one LSTM layer.

fn compute_hc<'a>(

    x_t: MatrixZero<'a, 1>,

    mut h_tm1: MatrixBorrowedMut<'a, 1>,

    mut c_tm1: MatrixBorrowedMut<'a, 1>,

    w: MatrixZero<'a, 3>,

    u: MatrixZero<'a, 3>,

    b: MatrixZero<'a, 2>,

) {

    #[cfg(debug_assertions)]

        let hunits = h_tm1.dim();

        let embedd_dim = x_t.dim();

        c_tm1.as_borrowed().debug_assert_dims([hunits]);

        w.debug_assert_dims([4, hunits, embedd_dim]);

        u.debug_assert_dims([4, hunits, hunits]);

        b.debug_assert_dims([4, hunits]);

    let mut s_t = b.to_owned();

    s_t.as_mut().add_dot_3d_2(x_t, w);

    s_t.as_mut().add_dot_3d_1(h_tm1.as_borrowed(), u);

    #[allow(clippy::unwrap_used)] // first dimension is 4

    s_t.submatrix_mut::<1>(0).unwrap().sigmoid_transform();

    #[allow(clippy::unwrap_used)] // first dimension is 4

    s_t.submatrix_mut::<1>(1).unwrap().sigmoid_transform();

    #[allow(clippy::unwrap_used)] // first dimension is 4

    s_t.submatrix_mut::<1>(2).unwrap().tanh_transform();

    #[allow(clippy::unwrap_used)] // first dimension is 4

    s_t.submatrix_mut::<1>(3).unwrap().sigmoid_transform();

    #[allow(clippy::unwrap_used)] // first dimension is 4

    c_tm1.convolve(

        s_t.as_borrowed().submatrix(0).unwrap(),

        s_t.as_borrowed().submatrix(2).unwrap(),

        s_t.as_borrowed().submatrix(1).unwrap(),

);

    #[allow(clippy::unwrap_used)] // first dimension is 4

    h_tm1.mul_tanh(s_t.as_borrowed().submatrix(3).unwrap(), c_tm1.as_borrowed());

#[cfg(test)]

mod tests {

    use super::*;

    use icu_locid::langid;

    use icu_provider::prelude::*;

    use serde::Deserialize;

    /// `TestCase` is a struct used to store a single test case.

    /// Each test case has two attributes: `unseg` which denotes the unsegmented line, and `true_bies` which indicates the Bies

    /// sequence representing the true segmentation.

    #[derive(PartialEq, Debug, Deserialize)]

    struct TestCase {

        unseg: String,

        expected_bies: String,

        true_bies: String,

    /// `TestTextData` is a struct to store a vector of `TestCase` that represents a test text.

    #[derive(PartialEq, Debug, Deserialize)]

    struct TestTextData {

        testcases: Vec<TestCase>,

    #[derive(Debug)]

    struct TestText {

        data: TestTextData,

    #[test]

    fn segment_file_by_lstm() {

        let lstm: DataPayload<LstmForWordLineAutoV1Marker> = crate::provider::Baked

            .load(DataRequest {

                locale: &langid!("th").into(),

                metadata: Default::default(),

})

            .unwrap()

            .take_payload()

            .unwrap();

        let lstm = LstmSegmenter::new(

            lstm.get(),

            crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,

);

        // Importing the test data

        let test_text_data = serde_json::from_str(if lstm.grapheme.is_some() {

            include_str!("../../../tests/testdata/test_text_graphclust.json")

        } else {

            include_str!("../../../tests/testdata/test_text_codepoints.json")

})

        .expect("JSON syntax error");

        let test_text = TestText {

            data: test_text_data,

};

        // Testing

        for test_case in &test_text.data.testcases {

            let lstm_output = lstm

                .segment_str_p(&test_case.unseg)

                .bies

                .map(|is_e| if is_e { 'e' } else { '?' })

                .collect::<String>();

            println!("Test case      : {}", test_case.unseg);

            println!("Expected bies  : {}", test_case.expected_bies);

            println!("Estimated bies : {lstm_output}");

            println!("True bies      : {}", test_case.true_bies);

            println!("****************************************************");

            assert_eq!(

                test_case.expected_bies.replace(['b', 'i', 's'], "?"),

                lstm_output

);