Source code

Revision control

Copy as Markdown

Other Tools

// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#[macro_use]
extern crate arrayref;
extern crate memmap2;
#[macro_use]
extern crate log;
use std::slice;
use std::str;
use std::cmp::max;
use std::fs::File;
use std::mem;
use memmap2::Mmap;
// Make submodules available publicly.
pub mod builder;
pub mod ffi;
// 4-byte identification expected at beginning of a compiled dictionary file.
// (This will be updated if an incompatible change to the format is made in
// some future revision.)
const MAGIC_NUMBER: [u8; 4] = [b'H', b'y', b'f', b'0'];
const INVALID_STRING_OFFSET: u16 = 0xffff;
const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff;
const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels
const LEVEL_HEADER_SIZE: usize = 16;
// Transition actually holds a 24-bit new state offset and an 8-bit input byte
// to match. We will be interpreting byte ranges as Transition arrays (in the
// State::transitions() method below), so use repr(C) to ensure we have the
// memory layout we expect.
// Transition records do not depend on any specific alignment.
#[repr(C)]
#[derive(Debug,Copy,Clone)]
struct Transition(u8, u8, u8, u8);
impl Transition {
fn new_state_offset(&self) -> usize {
// Read a 24-bit little-endian number from three bytes.
self.0 as usize + ((self.1 as usize) << 8) + ((self.2 as usize) << 16)
}
fn match_byte(&self) -> u8 {
self.3
}
}
// State is an area of the Level's data block that begins with a fixed header,
// followed by an array of transitions. The total size of each State's data
// depends on the number of transitions in the state. Only the basic header
// is defined by the struct here; the rest of the state is accessed via
// pointer magic.
// There are two versions of State, a basic version that supports only simple
// hyphenation (no associated spelling change), and an extended version that
// adds the replacement-string fields to support spelling changes at the
// hyphenation point. Check is_extended() to know which version is present.
// State records are NOT necessarily 4-byte aligned, so multi-byte fields
// should be read with care.
#[derive(Debug,Copy,Clone)]
#[repr(C)]
struct State {
fallback_state: [u8; 4],
match_string_offset: [u8; 2],
num_transitions: u8,
is_extended: u8,
}
#[repr(C)]
struct StateExtended {
state: State,
repl_string_offset: [u8; 2],
repl_index: i8,
repl_cut: i8,
}
impl State {
// Accessors for the various State header fields; see file format description.
fn fallback_state(&self) -> usize {
u32::from_le_bytes(self.fallback_state) as usize
}
fn match_string_offset(&self) -> usize {
u16::from_le_bytes(self.match_string_offset) as usize
}
fn num_transitions(&self) -> u8 {
self.num_transitions
}
fn is_extended(&self) -> bool {
self.is_extended != 0
}
// Accessors that are only valid if is_extended() is true.
// These use `unsafe` to dereference a pointer to the relevant field;
// this is OK because Level::get_state always validates the total state size
// before returning a state reference, so these pointers will be valid for
// any extended state it returns.
#[allow(dead_code)]
fn as_extended(&self) -> &StateExtended {
debug_assert!(self.is_extended());
unsafe { mem::transmute(self) }
}
#[allow(dead_code)]
fn repl_string_offset(&self) -> usize {
u16::from_le_bytes(self.as_extended().repl_string_offset) as usize
}
#[allow(dead_code)]
fn repl_index(&self) -> i8 {
self.as_extended().repl_index
}
#[allow(dead_code)]
fn repl_cut(&self) -> i8 {
self.as_extended().repl_cut
}
// Return the state's Transitions as a slice reference.
fn transitions(&self) -> &[Transition] {
let count = self.num_transitions() as usize;
if count == 0 {
return &[];
}
let transition_offset = if self.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() } as isize;
// We know the `offset` here will not look beyond the valid range of memory
// because Level::get_state() checks the state length (accounting for the
// number of transitions) before returning a State reference.
let trans_ptr = unsafe { (self as *const State as *const u8).offset(transition_offset) as *const Transition };
// Again, because Level::get_state() already checked the state length, we know
// this slice address and count will be valid.
unsafe { slice::from_raw_parts(trans_ptr, count) }
}
// Look up the Transition for a given input byte, or None.
fn transition_for(&self, b: u8) -> Option<Transition> {
// The transitions array is sorted by match_byte() value, but there are
// usually very few entries; benchmarking showed that using binary_search_by
// here gave no benefit (possibly slightly slower).
self.transitions().iter().copied().find(|t| t.match_byte() == b)
}
// Just for debugging use...
#[allow(dead_code)]
fn deep_show(&self, prefix: &str, dic: &Level) {
if self.match_string_offset() != INVALID_STRING_OFFSET as usize {
let match_string = dic.string_at_offset(self.match_string_offset());
println!("{}match: {}", prefix, str::from_utf8(match_string).unwrap());
}
for t in self.transitions() {
println!("{}{} ->", prefix, t.match_byte() as char);
let next_prefix = format!("{} ", prefix);
dic.get_state(t.new_state_offset()).unwrap().deep_show(&next_prefix, &dic);
}
}
}
// We count the presentation-form ligature characters U+FB00..FB06 as multiple
// chars for the purposes of lefthyphenmin/righthyphenmin. In UTF-8, all these
// ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this
// helper returns the "decomposed length" of the ligature given its trailing
// byte.
fn lig_length(trail_byte: u8) -> usize {
// This is only called on valid UTF-8 where we already know trail_byte
// must be >= 0x80.
// Ligature lengths: ff fi fl ffi ffl long-st st
const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8, 2u8 ];
if trail_byte > 0x86 {
return 1;
}
LENGTHS[trail_byte as usize - 0x80] as usize
}
fn is_utf8_trail_byte(byte: u8) -> bool {
(byte & 0xC0) == 0x80
}
fn is_ascii_digit(byte: u8) -> bool {
byte <= b'9' && byte >= b'0'
}
fn is_odd(byte: u8) -> bool {
(byte & 0x01) == 0x01
}
// A hyphenation Level has a header followed by State records and packed string
// data. The total size of the slice depends on the number and size of the
// States and Strings it contains.
// Note that the data of the Level may not have any specific alignment!
#[derive(Debug,Copy,Clone)]
struct Level<'a> {
data: &'a [u8],
// Header fields cached by the constructor for faster access:
state_data_base_: usize,
string_data_base_: usize,
}
impl Level<'_> {
// Constructor that initializes our cache variables.
fn new(data: &[u8]) -> Level {
Level {
data,
state_data_base_: u32::from_le_bytes(*array_ref!(data, 0, 4)) as usize,
string_data_base_: u32::from_le_bytes(*array_ref!(data, 4, 4)) as usize,
}
}
// Accessors for Level header fields; see file format description.
fn state_data_base(&self) -> usize {
self.state_data_base_ // cached by constructor
}
fn string_data_base(&self) -> usize {
self.string_data_base_ // cached by constructor
}
fn nohyphen_string_offset(&self) -> usize {
u16::from_le_bytes(*array_ref!(self.data, 8, 2)) as usize
}
#[allow(dead_code)]
fn nohyphen_count(&self) -> u16 {
u16::from_le_bytes(*array_ref!(self.data, 10, 2))
}
fn lh_min(&self) -> usize {
max(1, self.data[12] as usize)
}
fn rh_min(&self) -> usize {
max(1, self.data[13] as usize)
}
fn clh_min(&self) -> usize {
max(1, self.data[14] as usize)
}
fn crh_min(&self) -> usize {
max(1, self.data[15] as usize)
}
fn word_boundary_mins(&self) -> (usize, usize, usize, usize) {
(self.lh_min(), self.rh_min(), self.clh_min(), self.crh_min())
}
// Strings are represented as offsets from the Level's string_data_base.
// This returns a byte slice referencing the string at a given offset,
// or an empty slice if invalid.
fn string_at_offset(&self, offset: usize) -> &'_ [u8] {
if offset == INVALID_STRING_OFFSET as usize {
return &[];
}
let string_base = self.string_data_base() as usize + offset;
// TODO: move this to the validation function.
debug_assert!(string_base < self.data.len());
if string_base + 1 > self.data.len() {
return &[];
}
let len = self.data[string_base] as usize;
// TODO: move this to the validation function.
debug_assert!(string_base + 1 + len <= self.data.len());
if string_base + 1 + len > self.data.len() {
return &[];
}
self.data.get(string_base + 1 .. string_base + 1 + len).unwrap()
}
// The nohyphen field actually contains multiple NUL-separated substrings;
// return them as a vector of individual byte slices.
fn nohyphen(&self) -> Vec<&[u8]> {
let string_offset = self.nohyphen_string_offset();
let nohyph_str = self.string_at_offset(string_offset as usize);
if nohyph_str.is_empty() {
return vec![];
}
nohyph_str.split(|&b| b == 0).collect()
}
// States are represented as an offset from the Level's state_data_base.
// This returns a reference to the State at a given offset, or None if invalid.
fn get_state(&self, offset: usize) -> Option<&State> {
if offset == INVALID_STATE_OFFSET as usize {
return None;
}
debug_assert_eq!(offset & 3, 0);
let state_base = self.state_data_base() + offset;
// TODO: move this to the validation function.
debug_assert!(state_base + mem::size_of::<State>() <= self.string_data_base());
if state_base + mem::size_of::<State>() > self.string_data_base() {
return None;
}
let state_ptr = &self.data[state_base] as *const u8 as *const State;
// This is safe because we just checked against self.string_data_base() above.
let state = unsafe { state_ptr.as_ref().unwrap() };
let length = if state.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() }
+ mem::size_of::<Transition>() * state.num_transitions() as usize;
// TODO: move this to the validation function.
debug_assert!(state_base + length <= self.string_data_base());
if state_base + length > self.string_data_base() {
return None;
}
// This is safe because we checked the full state length against self.string_data_base().
unsafe { state_ptr.as_ref() }
}
// Sets hyphenation values (odd = potential break, even = no break) in values[],
// and returns the change in the number of odd values present, so the caller can
// keep track of the total number of potential breaks in the word.
fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize {
// Bail out immediately if the word is too short to hyphenate.
if word.len() < lh_min + rh_min {
return 0;
}
let start_state = self.get_state(0);
let mut st = start_state;
let mut hyph_count = 0;
for i in 0 .. word.len() + 2 {
// Loop over the word by bytes, with a virtual '.' added at each end
// to match word-boundary patterns.
let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] };
loop {
// Loop to repeatedly fall back if we don't find a matching transition.
// Note that this could infinite-loop if there is a state whose fallback
// points to itself (or a cycle of fallbacks), but this would represent
// a table compilation error.
// (A potential validation function could check for fallback cycles.)
if st.is_none() {
st = start_state;
break;
}
let state = st.unwrap();
if let Some(tr) = state.transition_for(b) {
// Found a transition for the current byte. Look up the new state;
// if it has a match_string, merge its weights into `values`.
st = self.get_state(tr.new_state_offset());
if let Some(state) = st {
let match_offset = state.match_string_offset();
if match_offset != INVALID_STRING_OFFSET as usize {
if state.is_extended() {
debug_assert!(false, "extended hyphenation not supported by this function");
} else {
let match_str = self.string_at_offset(match_offset);
let offset = i + 1 - match_str.len();
assert!(offset + match_str.len() <= word.len() + 2);
for (j, ch) in match_str.iter().enumerate() {
let index = offset + j;
if index >= lh_min && index <= word.len() - rh_min {
// lh_min and rh_min are guaranteed to be >= 1,
// so this will not try to access outside values[].
let old_value = values[index - 1];
let value = ch - b'0';
if value > old_value {
if is_odd(old_value) != is_odd(value) {
// Adjust hyph_count for the change we're making
hyph_count += if is_odd(value) { 1 } else { -1 };
}
values[index - 1] = value;
}
}
}
}
}
}
// We have handled the current input byte; leave the fallback loop
// and get next input.
break;
}
// No transition for the current byte; go to fallback state and try again.
st = self.get_state(state.fallback_state());
}
}
// If the word was not purely ASCII, or if the word begins/ends with
// digits, the use of lh_min and rh_min above may not have correctly
// excluded enough positions, so we need to fix things up here.
let mut index = 0;
let mut count = 0;
let word_bytes = word.as_bytes();
let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; };
// Handle lh_min.
while count < lh_min - 1 && index < word_bytes.len() {
let byte = word_bytes[index];
clear_hyphen_at(index);
if byte < 0x80 {
index += 1;
if is_ascii_digit(byte) {
continue; // ASCII digits don't count
}
} else if byte == 0xEF && word_bytes[index + 1] == 0xAC {
// Unicode presentation-form ligature characters, which we count as
// multiple chars for the purpose of lh_min/rh_min, all begin with
// 0xEF, 0xAC in UTF-8.
count += lig_length(word_bytes[index + 2]);
clear_hyphen_at(index + 1);
clear_hyphen_at(index + 2);
index += 3;
continue;
} else {
index += 1;
while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index]) {
clear_hyphen_at(index);
index += 1;
}
}
count += 1;
}
// Handle rh_min.
count = 0;
index = word.len();
while count < rh_min && index > 0 {
index -= 1;
let byte = word_bytes[index];
if index < word.len() - 1 {
clear_hyphen_at(index);
}
if byte < 0x80 {
// Only count if not an ASCII digit
if !is_ascii_digit(byte) {
count += 1;
}
continue;
}
if is_utf8_trail_byte(byte) {
continue;
}
if byte == 0xEF && word_bytes[index + 1] == 0xAC {
// Presentation-form ligatures count as multiple chars.
count += lig_length(word_bytes[index + 2]);
continue;
}
count += 1;
}
hyph_count
}
}
/// Hyphenation engine encapsulating a language-specific set of patterns (rules)
/// that identify possible break positions within a word.
pub struct Hyphenator<'a>(&'a [u8]);
impl Hyphenator<'_> {
/// Return a Hyphenator that wraps the given buffer.
/// This does *not* check that the given buffer is in fact a valid hyphenation table.
/// Use `is_valid_hyphenator()` to determine whether it is usable.
/// (Calling hyphenation methods on a Hyphenator that wraps arbitrary,
/// unvalidated data is not unsafe, but may panic.)
pub fn new(buffer: &[u8]) -> Hyphenator {
Hyphenator(buffer)
}
// Internal implementation details
fn magic_number(&self) -> &[u8] {
&self.0[0 .. 4]
}
fn num_levels(&self) -> usize {
u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize
}
fn level(&self, i: usize) -> Level {
let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize;
let limit = if i == self.num_levels() - 1 {
self.0.len()
} else {
u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize
};
debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len());
debug_assert_eq!(offset & 3, 0);
debug_assert_eq!(limit & 3, 0);
Level::new(&self.0[offset .. limit])
}
/// Identify acceptable hyphenation positions in the given `word`.
///
/// The caller-supplied `values` must be at least as long as the `word`.
///
/// On return, any elements with an odd value indicate positions in the word
/// after which a hyphen could be inserted.
///
/// Returns the number of possible hyphenation positions that were found.
///
/// # Panics
/// If the given `values` slice is too small to hold the results.
///
/// If the block of memory represented by `self.0` is not in fact a valid
/// hyphenation dictionary, this function may panic with an overflow or
/// array bounds violation.
pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize {
assert!(values.len() >= word.len());
values.iter_mut().for_each(|x| *x = 0);
let top_level = self.level(0);
let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins();
if word.len() < lh_min + rh_min {
return 0;
}
let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min);
let compound = hyph_count > 0;
// Subsequent levels are applied to fragments between potential breaks
// already found:
for l in 1 .. self.num_levels() {
let level = self.level(l);
if hyph_count > 0 {
let mut begin = 0;
let mut lh = lh_min;
// lh_min and rh_min are both guaranteed to be greater than zero,
// so this loop will not reach fully to the end of the word.
for i in lh_min - 1 .. word.len() - rh_min {
if is_odd(values[i]) {
if i > begin {
// We've found a component of a compound;
// clear the corresponding values and apply the new level.
// (These values must be even, so hyph_count is unchanged.)
values[begin .. i].iter_mut().for_each(|x| {
*x = 0;
});
hyph_count += level.find_hyphen_values(&word[begin ..= i],
&mut values[begin ..= i],
lh, crh_min);
}
begin = i + 1;
lh = clh_min;
}
}
if begin == 0 {
// No compound-word breaks were found, just apply level to the whole word.
hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
} else if begin < word.len() {
// Handle trailing component of compound.
hyph_count += level.find_hyphen_values(&word[begin .. word.len()],
&mut values[begin .. word.len()],
clh_min, rh_min);
}
} else {
hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
}
}
// Only need to check nohyphen strings if top-level (compound) breaks were found.
if compound && hyph_count > 0 {
let nohyph = top_level.nohyphen();
if !nohyph.is_empty() {
for i in lh_min ..= word.len() - rh_min {
if is_odd(values[i - 1]) {
for nh in &nohyph {
if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] {
values[i - 1] = 0;
hyph_count -= 1;
break;
}
if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] {
values[i - 1] = 0;
hyph_count -= 1;
break;
}
}
}
}
}
}
hyph_count
}
/// Generate the hyphenated form of a `word` by inserting the given `hyphen_char`
/// at each valid break position.
///
/// # Panics
/// If the block of memory represented by `self` is not in fact a valid
/// hyphenation dictionary, this function may panic with an overflow or
/// array bounds violation.
///
/// Also panics if the length of the hyphenated word would overflow `usize`.
pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String {
let mut values = vec![0u8; word.len()];
let hyph_count = self.find_hyphen_values(word, &mut values);
if hyph_count <= 0 {
return word.to_string();
}
// We know how long the result will be, so we can preallocate here.
let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8();
let mut result = String::with_capacity(result_len);
let mut n = 0;
for ch in word.char_indices() {
if ch.0 > 0 && is_odd(values[ch.0 - 1]) {
result.push(hyphchar);
n += 1;
}
result.push(ch.1);
}
debug_assert_eq!(n, hyph_count);
debug_assert_eq!(result_len, result.len());
result
}
/// Check if the block of memory looks like it could be a valid hyphenation
/// table.
pub fn is_valid_hyphenator(&self) -> bool {
// Size must be at least 4 bytes for magic_number + 4 bytes num_levels;
// smaller than this cannot be safely inspected.
if self.0.len() < FILE_HEADER_SIZE {
return false;
}
if self.magic_number() != MAGIC_NUMBER {
return false;
}
// For each level, there's a 4-byte offset in the header, and the level
// has its own 16-byte header, so we can check a minimum size again here.
let num_levels = self.num_levels();
if self.0.len() < FILE_HEADER_SIZE + LEVEL_HEADER_SIZE * num_levels {
return false;
}
// Check that state_data_base and string_data_base for each hyphenation
// level are within range.
for l in 0 .. num_levels {
let level = self.level(l);
if level.state_data_base() < LEVEL_HEADER_SIZE ||
level.state_data_base() > level.string_data_base() ||
level.string_data_base() > level.data.len() {
return false;
}
// TODO: consider doing more extensive validation of states and
// strings within the level?
}
// It's still possible the dic is internally broken, but at least it's
// worth trying to use it!
true
}
}
/// Load the compiled hyphenation file at `dic_path`, if present.
///
/// Returns `None` if the specified file cannot be opened or mapped,
/// otherwise returns a `memmap2::Mmap` mapping the file.
///
/// # Safety
///
/// This is unsafe for the same reason `Mmap::map()` is unsafe:
/// mapped_hyph does not guarantee safety if the mapped file is modified
/// (e.g. by another process) while we're using it.
///
/// This verifies that the file looks superficially like it may be a
/// compiled hyphenation table, but does *not* fully check the validity
/// of the file contents! Calling hyphenation functions with the returned
/// data is not unsafe, but may panic if the data is invalid.
pub unsafe fn load_file(dic_path: &str) -> Option<Mmap> {
let file = File::open(dic_path).ok()?;
let dic = Mmap::map(&file).ok()?;
let hyph = Hyphenator(&*dic);
if hyph.is_valid_hyphenator() {
return Some(dic);
}
None
}