// WordSegmenter.d.ts

// generated by diplomat-tool

import type { DataError } from "./DataError"

import type { DataProvider } from "./DataProvider"

import type { Locale } from "./Locale"

import type { WordBreakIteratorUtf16 } from "./WordBreakIteratorUtf16"

import type { pointer, codepoint } from "./diplomat-runtime.d.ts";

/**
 * An ICU4X word-break segmenter, capable of finding word breakpoints in strings.
 *
 * See the [Rust documentation for `WordSegmenter`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html) for more information.
 */

export class WordSegmenter {

    /**
     * Read-only accessor exposing this object's `pointer` value (the `pointer` type
     * comes from the diplomat runtime).
     *
     * NOTE(review): presumably the handle of the underlying native segmenter used for
     * FFI calls — confirm against the diplomat runtime before relying on it.
     */
    get ffiValue(): pointer;

/**
     * Construct a [`WordSegmenter`] that automatically selects the best available LSTM
     * or dictionary payload data, using compiled data. This does not assume any content locale.
     *
     * Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     * Khmer, Lao, and Thai.
     *
     * See the [Rust documentation for `new_auto`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_auto) for more information.
     *
     * @returns the newly constructed `WordSegmenter`.
*/
    static createAuto(): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] that automatically selects the best available LSTM
     * or dictionary payload data, using compiled data.
     *
     * Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     * Khmer, Lao, and Thai.
     *
     * See the [Rust documentation for `try_new_auto`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.try_new_auto) for more information.
     *
     * NOTE(review): the linked Rust constructor is `try_new_auto`, so construction is
     * presumably fallible; this file imports `DataError` without using it in any visible
     * signature — confirm whether a `DataError` is thrown on failure.
     *
     * @param locale - the content locale to construct the segmenter with.
     * @returns the newly constructed `WordSegmenter`.
*/
    static createAutoWithContentLocale(locale: Locale): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] that automatically selects the best available LSTM
     * or dictionary payload data, using a particular data source.
     *
     * Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     * Khmer, Lao, and Thai.
     *
     * See the [Rust documentation for `try_new_auto`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.try_new_auto) for more information.
     *
     * NOTE(review): the linked Rust constructor is `try_new_auto`, so construction is
     * presumably fallible; this file imports `DataError` without using it in any visible
     * signature — confirm whether a `DataError` is thrown on failure.
     *
     * @param provider - the data source to load the payload data from.
     * @param locale - the content locale to construct the segmenter with.
     * @returns the newly constructed `WordSegmenter`.
*/
    static createAutoWithContentLocaleAndProvider(provider: DataProvider, locale: Locale): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
     * Thai, using compiled data. This does not assume any content locale.
     *
     * Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     * Khmer, Lao, and Thai.
     *
     * NOTE(review): the note above mentions dictionary data for Chinese and Japanese even
     * though this constructor is described as loading LSTM data only; it may have been
     * carried over from the "auto" constructor — confirm against the Rust documentation.
     *
     * See the [Rust documentation for `new_lstm`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_lstm) for more information.
     *
     * @returns the newly constructed `WordSegmenter`.
*/
    static createLstm(): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
     * Thai, using compiled data.
     *
     * Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     * Khmer, Lao, and Thai.
     *
     * NOTE(review): the note above mentions dictionary data for Chinese and Japanese even
     * though this constructor is described as loading LSTM data only; it may have been
     * carried over from the "auto" constructor — confirm against the Rust documentation.
     *
     * See the [Rust documentation for `try_new_lstm`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.try_new_lstm) for more information.
     *
     * NOTE(review): the linked Rust constructor is `try_new_lstm`, so construction is
     * presumably fallible; this file imports `DataError` without using it in any visible
     * signature — confirm whether a `DataError` is thrown on failure.
     *
     * @param locale - the content locale to construct the segmenter with.
     * @returns the newly constructed `WordSegmenter`.
*/
    static createLstmWithContentLocale(locale: Locale): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
     * Thai, using a particular data source.
     *
     * Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     * Khmer, Lao, and Thai.
     *
     * NOTE(review): the note above mentions dictionary data for Chinese and Japanese even
     * though this constructor is described as loading LSTM data only; it may have been
     * carried over from the "auto" constructor — confirm against the Rust documentation.
     *
     * See the [Rust documentation for `try_new_lstm`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.try_new_lstm) for more information.
     *
     * NOTE(review): the linked Rust constructor is `try_new_lstm`, so construction is
     * presumably fallible; this file imports `DataError` without using it in any visible
     * signature — confirm whether a `DataError` is thrown on failure.
     *
     * @param provider - the data source to load the payload data from.
     * @param locale - the content locale to construct the segmenter with.
     * @returns the newly constructed `WordSegmenter`.
*/
    static createLstmWithContentLocaleAndProvider(provider: DataProvider, locale: Locale): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
     * Burmese, Khmer, Lao, and Thai, using compiled data. This does not assume any content locale.
     *
     * Note: currently, it uses dictionary data for all of these languages: Chinese, Japanese,
     * Burmese, Khmer, Lao, and Thai.
     *
     * See the [Rust documentation for `new_dictionary`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_dictionary) for more information.
     *
     * @returns the newly constructed `WordSegmenter`.
*/
    static createDictionary(): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
     * Burmese, Khmer, Lao, and Thai, using compiled data.
     *
     * Note: currently, it uses dictionary data for all of these languages: Chinese, Japanese,
     * Burmese, Khmer, Lao, and Thai.
     *
     * See the [Rust documentation for `try_new_dictionary`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.try_new_dictionary) for more information.
     *
     * NOTE(review): the linked Rust constructor is `try_new_dictionary`, so construction is
     * presumably fallible; this file imports `DataError` without using it in any visible
     * signature — confirm whether a `DataError` is thrown on failure.
     *
     * @param locale - the content locale to construct the segmenter with.
     * @returns the newly constructed `WordSegmenter`.
*/
    static createDictionaryWithContentLocale(locale: Locale): WordSegmenter;

/**
     * Construct a [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
     * Burmese, Khmer, Lao, and Thai, using a particular data source.
     *
     * Note: currently, it uses dictionary data for all of these languages: Chinese, Japanese,
     * Burmese, Khmer, Lao, and Thai.
     *
     * See the [Rust documentation for `try_new_dictionary`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.try_new_dictionary) for more information.
     *
     * NOTE(review): the linked Rust constructor is `try_new_dictionary`, so construction is
     * presumably fallible; this file imports `DataError` without using it in any visible
     * signature — confirm whether a `DataError` is thrown on failure.
     *
     * @param provider - the data source to load the payload data from.
     * @param locale - the content locale to construct the segmenter with.
     * @returns the newly constructed `WordSegmenter`.
*/
    static createDictionaryWithContentLocaleAndProvider(provider: DataProvider, locale: Locale): WordSegmenter;

/**
     * Segments a string.
     *
     * Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
     * to the WHATWG Encoding Standard.
     *
     * See the [Rust documentation for `segment_utf16`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenterBorrowed.html#method.segment_utf16) for more information.
     *
     * @param input - the string to segment.
     * @returns a `WordBreakIteratorUtf16` for the segmentation of `input`.
*/
    segment(input: string): WordBreakIteratorUtf16;

// Residue from the web source viewer (not part of the declaration file):
// "Source code", "Revision control", "Copy as Markdown", "Other Tools".