special.rs - mozsearch

use crate::{

    dfa::DEAD,

    util::{

        primitives::StateID,

        wire::{self, DeserializeError, Endian, SerializeError},

},

};

macro_rules! err {

    ($msg:expr) => {

        return Err(DeserializeError::generic($msg));

};

// Special represents the identifiers in a DFA that correspond to "special"

// states. If a state is one or more of the following, then it is considered

// special:

//

// * dead - A non-matching state where all outgoing transitions lead back to

//   itself. There is only one of these, regardless of whether minimization

//   has run. The dead state always has an ID of 0. i.e., It is always the

//   first state in a DFA.

// * quit - A state that is entered whenever a byte is seen that should cause

//   a DFA to give up and stop searching. This results in a MatchError::quit

//   error being returned at search time. The default configuration for a DFA

//   has no quit bytes, which means this state is unreachable by default,

//   although it is always present for reasons of implementation simplicity.

//   This state is only reachable when the caller configures the DFA to quit

//   on certain bytes. There is always exactly one of these states and it

//   is always the second state. (Its actual ID depends on the size of the

//   alphabet in dense DFAs, since state IDs are premultiplied in order to

//   allow them to be used directly as indices into the transition table.)

// * match - An accepting state, i.e., indicative of a match. There may be

//   zero or more of these states.

// * accelerated - A state where all of its outgoing transitions, except a

//   few, loop back to itself. These states are candidates for acceleration

//   via memchr during search. There may be zero or more of these states.

// * start - A non-matching state that indicates where the automaton should

//   start during a search. There is always at least one starting state and

//   all are guaranteed to be non-match states. (A start state cannot be a

//   match state because the DFAs in this crate delay all matches by one byte.

//   So every search that finds a match must move through one transition to

//   some other match state, even when searching an empty string.)

//

// These are not mutually exclusive categories. Namely, the following

// overlappings can occur:

//

// * {dead, start} - If a DFA can never lead to a match and it is minimized,

//   then it will typically compile to something where all starting IDs point

//   to the DFA's dead state.

// * {match, accelerated} - It is possible for a match state to have the

//   majority of its transitions loop back to itself, which means it's

//   possible for a match state to be accelerated.

// * {start, accelerated} - Similarly, it is possible for a start state to be

//   accelerated. Note that it is possible for an accelerated state to be

//   neither a match or a start state. Also note that just because both match

//   and start states overlap with accelerated states does not mean that

//   match and start states overlap with each other. In fact, they are

//   guaranteed not to overlap.

//

// As a special mention, every DFA always has a dead and a quit state, even

// though from the perspective of the DFA, they are equivalent. (Indeed,

// minimization special cases them to ensure they don't get merged.) The

// purpose of keeping them distinct is to use the quit state as a sentinel to

// distguish between whether a search finished successfully without finding

// anything or whether it gave up before finishing.

//

// So the main problem we want to solve here is the *fast* detection of whether

// a state is special or not. And we also want to do this while storing as

// little extra data as possible. AND we want to be able to quickly determine

// which categories a state falls into above if it is special.

//

// We achieve this by essentially shuffling all special states to the beginning

// of a DFA. That is, all special states appear before every other non-special

// state. By representing special states this way, we can determine whether a

// state is special or not by a single comparison, where special.max is the

// identifier of the last special state in the DFA:

//

//     if current_state <= special.max:

//         ... do something with special state

//

// The only thing left to do is to determine what kind of special state

// it is. Because what we do next depends on that. Since special states

// are typically rare, we can afford to do a bit more extra work, but we'd

// still like this to be as fast as possible. The trick we employ here is to

// continue shuffling states even within the special state range. Such that

// one contiguous region corresponds to match states, another for start states

// and then an overlapping range for accelerated states. At a high level, our

// special state detection might look like this (for leftmost searching, where

// we continue searching even after seeing a match):

//

//     byte = input[offset]

//     current_state = next_state(current_state, byte)

//     offset += 1

//     if current_state <= special.max:

//         if current_state == 0:

//             # We can never leave a dead state, so this always marks the

//             # end of our search.

//             return last_match

//         if current_state == special.quit_id:

//             # A quit state means we give up. If he DFA has no quit state,

//             # then special.quit_id == 0 == dead, which is handled by the

//             # conditional above.

//             return Err(MatchError::quit { byte, offset: offset - 1 })

//         if special.min_match <= current_state <= special.max_match:

//             last_match = Some(offset)

//             if special.min_accel <= current_state <= special.max_accel:

//                 offset = accelerate(input, offset)

//                 last_match = Some(offset)

//         elif special.min_start <= current_state <= special.max_start:

//             offset = prefilter.find(input, offset)

//             if special.min_accel <= current_state <= special.max_accel:

//                 offset = accelerate(input, offset)

//         elif special.min_accel <= current_state <= special.max_accel:

//             offset = accelerate(input, offset)

//

// There are some small details left out of the logic above. For example,

// in order to accelerate a state, we need to know which bytes to search for.

// This in turn implies some extra data we need to store in the DFA. To keep

// things compact, we would ideally only store

//

//     N = special.max_accel - special.min_accel + 1

//

// items. But state IDs are premultiplied, which means they are not contiguous.

// So in order to take a state ID and index an array of accelerated structures,

// we need to do:

//

//     i = (state_id - special.min_accel) / stride

//

// (N.B. 'stride' is always a power of 2, so the above can be implemented via

// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in

// 2^x=stride.)

//

// Moreover, some of these specialty categories may be empty. For example,

// DFAs are not required to have any match states or any accelerated states.

// In that case, the lower and upper bounds are both set to 0 (the dead state

// ID) and the first `current_state == 0` check subsumes cases where the

// ranges are empty.

//

// Loop unrolling, if applicable, has also been left out of the logic above.

//

// Graphically, the ranges look like this, where asterisks indicate ranges

// that can be empty. Each 'x' is a state.

//

//      quit

//  dead|

//     ||

//     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

//     | |             |    | start |                       |

//     | |-------------|    |-------|                       |

//     |   match*   |          |    |                       |

//     |            |          |    |                       |

//     |            |----------|    |                       |

//     |                accel*      |                       |

//     |                            |                       |

//     |                            |                       |

//     |----------------------------|------------------------

//              special                   non-special*

#[derive(Clone, Copy, Debug)]

pub(crate) struct Special {

    /// The identifier of the last special state in a DFA. A state is special

    /// if and only if its identifier is less than or equal to `max`.

    pub(crate) max: StateID,

    /// The identifier of the quit state in a DFA. (There is no analogous field

    /// for the dead state since the dead state's ID is always zero, regardless

    /// of state ID size.)

    pub(crate) quit_id: StateID,

    /// The identifier of the first match state.

    pub(crate) min_match: StateID,

    /// The identifier of the last match state.

    pub(crate) max_match: StateID,

    /// The identifier of the first accelerated state.

    pub(crate) min_accel: StateID,

    /// The identifier of the last accelerated state.

    pub(crate) max_accel: StateID,

    /// The identifier of the first start state.

    pub(crate) min_start: StateID,

    /// The identifier of the last start state.

    pub(crate) max_start: StateID,

impl Special {

    /// Creates a new set of special ranges for a DFA. All ranges are initially

    /// set to only contain the dead state. This is interpreted as an empty

    /// range.

    #[cfg(feature = "dfa-build")]

    pub(crate) fn new() -> Special {

        Special {

            max: DEAD,

            quit_id: DEAD,

            min_match: DEAD,

            max_match: DEAD,

            min_accel: DEAD,

            max_accel: DEAD,

            min_start: DEAD,

            max_start: DEAD,

    /// Remaps all of the special state identifiers using the function given.

    #[cfg(feature = "dfa-build")]

    pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {

        Special {

            max: map(self.max),

            quit_id: map(self.quit_id),

            min_match: map(self.min_match),

            max_match: map(self.max_match),

            min_accel: map(self.min_accel),

            max_accel: map(self.max_accel),

            min_start: map(self.min_start),

            max_start: map(self.max_start),

    /// Deserialize the given bytes into special state ranges. If the slice

    /// given is not big enough, then this returns an error. Similarly, if

    /// any of the expected invariants around special state ranges aren't

    /// upheld, an error is returned. Note that this does not guarantee that

    /// the information returned is correct.

///

    /// Upon success, this returns the number of bytes read in addition to the

    /// special state IDs themselves.

    pub(crate) fn from_bytes(

        mut slice: &[u8],

    ) -> Result<(Special, usize), DeserializeError> {

        wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;

        let mut nread = 0;

        let mut read_id = |what| -> Result<StateID, DeserializeError> {

            let (id, nr) = wire::try_read_state_id(slice, what)?;

            nread += nr;

            slice = &slice[StateID::SIZE..];

            Ok(id)

};

        let max = read_id("special max id")?;

        let quit_id = read_id("special quit id")?;

        let min_match = read_id("special min match id")?;

        let max_match = read_id("special max match id")?;

        let min_accel = read_id("special min accel id")?;

        let max_accel = read_id("special max accel id")?;

        let min_start = read_id("special min start id")?;

        let max_start = read_id("special max start id")?;

        let special = Special {

            max,

            quit_id,

            min_match,

            max_match,

            min_accel,

            max_accel,

            min_start,

            max_start,

};

        special.validate()?;

        assert_eq!(nread, special.write_to_len());

        Ok((special, nread))

    /// Validate that the information describing special states satisfies

    /// all known invariants.

    pub(crate) fn validate(&self) -> Result<(), DeserializeError> {

        // Check that both ends of the range are DEAD or neither are.

        if self.min_match == DEAD && self.max_match != DEAD {

            err!("min_match is DEAD, but max_match is not");

        if self.min_match != DEAD && self.max_match == DEAD {

            err!("max_match is DEAD, but min_match is not");

        if self.min_accel == DEAD && self.max_accel != DEAD {

            err!("min_accel is DEAD, but max_accel is not");

        if self.min_accel != DEAD && self.max_accel == DEAD {

            err!("max_accel is DEAD, but min_accel is not");

        if self.min_start == DEAD && self.max_start != DEAD {

            err!("min_start is DEAD, but max_start is not");

        if self.min_start != DEAD && self.max_start == DEAD {

            err!("max_start is DEAD, but min_start is not");

        // Check that ranges are well formed.

        if self.min_match > self.max_match {

            err!("min_match should not be greater than max_match");

        if self.min_accel > self.max_accel {

            err!("min_accel should not be greater than max_accel");

        if self.min_start > self.max_start {

            err!("min_start should not be greater than max_start");

        // Check that ranges are ordered with respect to one another.

        if self.matches() && self.quit_id >= self.min_match {

            err!("quit_id should not be greater than min_match");

        if self.accels() && self.quit_id >= self.min_accel {

            err!("quit_id should not be greater than min_accel");

        if self.starts() && self.quit_id >= self.min_start {

            err!("quit_id should not be greater than min_start");

        if self.matches() && self.accels() && self.min_accel < self.min_match {

            err!("min_match should not be greater than min_accel");

        if self.matches() && self.starts() && self.min_start < self.min_match {

            err!("min_match should not be greater than min_start");

        if self.accels() && self.starts() && self.min_start < self.min_accel {

            err!("min_accel should not be greater than min_start");

        // Check that max is at least as big as everything else.

        if self.max < self.quit_id {

            err!("quit_id should not be greater than max");

        if self.max < self.max_match {

            err!("max_match should not be greater than max");

        if self.max < self.max_accel {

            err!("max_accel should not be greater than max");

        if self.max < self.max_start {

            err!("max_start should not be greater than max");

        Ok(())

    /// Validate that the special state information is compatible with the

    /// given state len.

    pub(crate) fn validate_state_len(

        &self,

        len: usize,

        stride2: usize,

    ) -> Result<(), DeserializeError> {

        // We assume that 'validate' has already passed, so we know that 'max'

        // is truly the max. So all we need to check is that the max state ID

        // is less than the state ID len. The max legal value here is len-1,

        // which occurs when there are no non-special states.

        if (self.max.as_usize() >> stride2) >= len {

            err!("max should not be greater than or equal to state length");

        Ok(())

    /// Write the IDs and ranges for special states to the given byte buffer.

    /// The buffer given must have enough room to store all data, otherwise

    /// this will return an error. The number of bytes written is returned

    /// on success. The number of bytes written is guaranteed to be a multiple

    /// of 8.

    pub(crate) fn write_to<E: Endian>(

        &self,

        dst: &mut [u8],

    ) -> Result<usize, SerializeError> {

        use crate::util::wire::write_state_id as write;

        if dst.len() < self.write_to_len() {

            return Err(SerializeError::buffer_too_small("special state ids"));

        let mut nwrite = 0;

        nwrite += write::<E>(self.max, &mut dst[nwrite..]);

        nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);

        nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);

        nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);

        nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);

        nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);

        nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);

        nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);

        assert_eq!(

            self.write_to_len(),

            nwrite,

            "expected to write certain number of bytes",

);

        assert_eq!(

            nwrite % 8,

0,

            "expected to write multiple of 8 bytes for special states",

);

        Ok(nwrite)

    /// Returns the total number of bytes written by `write_to`.

    pub(crate) fn write_to_len(&self) -> usize {

        8 * StateID::SIZE

    /// Sets the maximum special state ID based on the current values. This

    /// should be used once all possible state IDs are set.

    #[cfg(feature = "dfa-build")]

    pub(crate) fn set_max(&mut self) {

        use core::cmp::max;

        self.max = max(

            self.quit_id,

            max(self.max_match, max(self.max_accel, self.max_start)),

);

    /// Sets the maximum special state ID such that starting states are not

    /// considered "special." This also marks the min/max starting states as

    /// DEAD such that 'is_start_state' always returns false, even if the state

    /// is actually a starting state.

///

    /// This is useful when there is no prefilter set. It will avoid

    /// ping-ponging between the hot path in the DFA search code and the start

    /// state handling code, which is typically only useful for executing a

    /// prefilter.

    #[cfg(feature = "dfa-build")]

    pub(crate) fn set_no_special_start_states(&mut self) {

        use core::cmp::max;

        self.max = max(self.quit_id, max(self.max_match, self.max_accel));

        self.min_start = DEAD;

        self.max_start = DEAD;

    /// Returns true if and only if the given state ID is a special state.

    #[inline]

    pub(crate) fn is_special_state(&self, id: StateID) -> bool {

        id <= self.max

    /// Returns true if and only if the given state ID is a dead state.

    #[inline]

    pub(crate) fn is_dead_state(&self, id: StateID) -> bool {

        id == DEAD

    /// Returns true if and only if the given state ID is a quit state.

    #[inline]

    pub(crate) fn is_quit_state(&self, id: StateID) -> bool {

        !self.is_dead_state(id) && self.quit_id == id

    /// Returns true if and only if the given state ID is a match state.

    #[inline]

    pub(crate) fn is_match_state(&self, id: StateID) -> bool {

        !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match

    /// Returns true if and only if the given state ID is an accel state.

    #[inline]

    pub(crate) fn is_accel_state(&self, id: StateID) -> bool {

        !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel

    /// Returns true if and only if the given state ID is a start state.

    #[inline]

    pub(crate) fn is_start_state(&self, id: StateID) -> bool {

        !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start

    /// Returns the total number of match states for a dense table based DFA.

    #[inline]

    pub(crate) fn match_len(&self, stride: usize) -> usize {

        if self.matches() {

            (self.max_match.as_usize() - self.min_match.as_usize() + stride)

                / stride

        } else {

    /// Returns true if and only if there is at least one match state.

    #[inline]

    pub(crate) fn matches(&self) -> bool {

        self.min_match != DEAD

    /// Returns the total number of accel states.

    #[cfg(feature = "dfa-build")]

    pub(crate) fn accel_len(&self, stride: usize) -> usize {

        if self.accels() {

            (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)

                / stride

        } else {

    /// Returns true if and only if there is at least one accel state.

    #[inline]

    pub(crate) fn accels(&self) -> bool {

        self.min_accel != DEAD

    /// Returns true if and only if there is at least one start state.

    #[inline]

    pub(crate) fn starts(&self) -> bool {

        self.min_start != DEAD

Source code

Revision control

Copy as Markdown

Other Tools