Source code

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Classes to iterate over grapheme, word, sentence, or line. */
#ifndef intl_components_Segmenter_h_
#define intl_components_Segmenter_h_
#include "mozilla/intl/ICUError.h"
#include "mozilla/Maybe.h"
#include "mozilla/Result.h"
#include "mozilla/Span.h"
#include "mozilla/UniquePtr.h"
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
namespace capi {
struct ICU4XLineSegmenter;
struct ICU4XLineBreakIteratorUtf16;
struct ICU4XWordSegmenter;
struct ICU4XWordBreakIteratorUtf16;
struct ICU4XGraphemeClusterSegmenter;
struct ICU4XGraphemeClusterBreakIteratorUtf16;
struct ICU4XSentenceSegmenter;
struct ICU4XSentenceBreakIteratorUtf16;
} // namespace capi
#endif
namespace mozilla::intl {
enum class SegmenterGranularity : uint8_t {
Grapheme,
Word,
Sentence,
Line,
};
struct SegmenterOptions final {
SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme;
};
/**
* Interface of segment iterators. Subclass this class to implement iterator for
* UTF-16 text.
*/
class SegmentIteratorUtf16 {
public:
virtual ~SegmentIteratorUtf16() = default;
// Disable copy or move semantics. Move semantic could be enabled in the
// future if needed.
SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete;
SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete;
SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete;
SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete;
/**
* Advance the iterator to the next break position.
*
* @return the break position. If there's no further break position, return
* Nothing().
*/
virtual Maybe<uint32_t> Next() = 0;
/**
* Advance the iterator to the first break position following the specified
* position aPos.
*
* Note: if this iterator's current position is already >= aPos, this method
* behaves the same as Next().
*/
virtual Maybe<uint32_t> Seek(uint32_t aPos);
protected:
explicit SegmentIteratorUtf16(Span<const char16_t> aText);
// The text to iterate over.
Span<const char16_t> mText;
// The current break position within mText.
uint32_t mPos = 0;
};
// Each enum value has the same meaning with respect to the `word-break`
// property values in the CSS Text spec. See the details in
enum class WordBreakRule : uint8_t {
Normal = 0,
BreakAll,
KeepAll,
};
// Each enum value has the same meaning with respect to the `line-break`
// property values in the CSS Text spec. See the details in
enum class LineBreakRule : uint8_t {
Auto = 0,
Loose,
Normal,
Strict,
Anywhere,
};
// Extra options for line break iterator.
struct LineBreakOptions final {
WordBreakRule mWordBreakRule = WordBreakRule::Normal;
LineBreakRule mLineBreakRule = LineBreakRule::Auto;
bool mScriptIsChineseOrJapanese = false;
};
/**
* Line break iterator for UTF-16 text.
*/
class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
const LineBreakOptions& aOptions = {});
~LineBreakIteratorUtf16() override;
Maybe<uint32_t> Next() override;
Maybe<uint32_t> Seek(uint32_t aPos) override;
private:
LineBreakOptions mOptions;
#ifdef MOZ_ICU4X
capi::ICU4XLineSegmenter* mSegmenter = nullptr;
capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr;
#endif
};
/**
* Word break iterator for UTF-16 text.
*/
class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
~WordBreakIteratorUtf16() override;
Maybe<uint32_t> Next() override;
Maybe<uint32_t> Seek(uint32_t aPos) override;
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
private:
capi::ICU4XWordSegmenter* mSegmenter = nullptr;
capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr;
#endif
};
/**
* Grapheme cluster break iterator for UTF-16 text.
*/
class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
~GraphemeClusterBreakIteratorUtf16() override;
Maybe<uint32_t> Next() override;
Maybe<uint32_t> Seek(uint32_t aPos) override;
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
private:
static capi::ICU4XGraphemeClusterSegmenter* sSegmenter;
capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr;
#endif
};
/**
* Grapheme cluster break reverse iterator for UTF-16 text.
*
* Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it
* at your own risk.
*/
class GraphemeClusterBreakReverseIteratorUtf16 final
: public SegmentIteratorUtf16 {
public:
explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText);
Maybe<uint32_t> Next() override;
Maybe<uint32_t> Seek(uint32_t aPos) override;
};
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
/**
* Sentence break iterator for UTF-16 text.
*/
class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
~SentenceBreakIteratorUtf16() override;
Maybe<uint32_t> Next() override;
Maybe<uint32_t> Seek(uint32_t aPos) override;
private:
capi::ICU4XSentenceSegmenter* mSegmenter = nullptr;
capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr;
};
#endif
/**
* This component is a Mozilla-focused API for working with segmenters in
* internationalization code.
*
* This is a factor class. Calling Segment() to create an iterator over a text
* of given granularity.
*/
class Segmenter final {
public:
// NOTE: aLocale is a no-op currently.
static Result<UniquePtr<Segmenter>, ICUError> TryCreate(
Span<const char> aLocale, const SegmenterOptions& aOptions);
explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions)
: mOptions(aOptions) {}
// Creates an iterator over aText of a given granularity in mOptions.
UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const;
// TODO: Implement an iterator for Latin1 text.
// UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
private:
SegmenterOptions mOptions;
};
} // namespace mozilla::intl
#endif