regex.rs - mozsearch

/*!

A DFA-backed `Regex`.

This module provides [`Regex`], which is defined generically over the

[`Automaton`] trait. A `Regex` implements convenience routines you might have

come to expect, such as finding the start/end of a match and iterating over

all non-overlapping matches. This `Regex` type is limited in its capabilities

to what a DFA can provide. Therefore, APIs involving capturing groups, for

example, are not provided.

Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that

finds the end offset of a match, where as the other is a "reverse" DFA that

find the start offset of a match.

See the [parent module](crate::dfa) for examples.

*/

#[cfg(feature = "alloc")]

use alloc::vec::Vec;

#[cfg(feature = "dfa-build")]

use crate::dfa::dense::BuildError;

use crate::{

    dfa::{automaton::Automaton, dense},

    util::{iter, search::Input},

    Anchored, Match, MatchError,

};

#[cfg(feature = "alloc")]

use crate::{

    dfa::{sparse, StartKind},

    util::search::MatchKind,

};

// When the alloc feature is enabled, the regex type sets its A type parameter

// to default to an owned dense DFA. But without alloc, we set no default. This

// makes things a lot more convenient in the common case, since writing out the

// DFA types is pretty annoying.

//

// Since we have two different definitions but only want to write one doc

// string, we use a macro to capture the doc and other attributes once and then

// repeat them for each definition.

macro_rules! define_regex_type {

    ($(#[$doc:meta])*) => {

        #[cfg(feature = "alloc")]

        $(#[$doc])*

        pub struct Regex<A = dense::OwnedDFA> {

            forward: A,

            reverse: A,

        #[cfg(not(feature = "alloc"))]

        $(#[$doc])*

        pub struct Regex<A> {

            forward: A,

            reverse: A,

};

define_regex_type!(

    /// A regular expression that uses deterministic finite automata for fast

    /// searching.

///

    /// A regular expression is comprised of two DFAs, a "forward" DFA and a

    /// "reverse" DFA. The forward DFA is responsible for detecting the end of

    /// a match while the reverse DFA is responsible for detecting the start

    /// of a match. Thus, in order to find the bounds of any given match, a

    /// forward search must first be run followed by a reverse search. A match

    /// found by the forward DFA guarantees that the reverse DFA will also find

    /// a match.

///

    /// The type of the DFA used by a `Regex` corresponds to the `A` type

    /// parameter, which must satisfy the [`Automaton`] trait. Typically,

    /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a

    /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more

    /// memory but search faster, while sparse DFAs use less memory but search

    /// more slowly.

///

    /// # Crate features

///

    /// Note that despite what the documentation auto-generates, the _only_

    /// crate feature needed to use this type is `dfa-search`. You do _not_

    /// need to enable the `alloc` feature.

///

    /// By default, a regex's automaton type parameter is set to

    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most

    /// in-memory work loads, this is the most convenient type that gives the

    /// best search performance. When the `alloc` feature is disabled, no

    /// default type is used.

///

    /// # When should I use this?

///

    /// Generally speaking, if you can afford the overhead of building a full

    /// DFA for your regex, and you don't need things like capturing groups,

    /// then this is a good choice if you're looking to optimize for matching

    /// speed. Note however that its speed may be worse than a general purpose

    /// regex engine if you don't provide a [`dense::Config::prefilter`] to the

    /// underlying DFA.

///

    /// # Sparse DFAs

///

    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be

    /// used with any kind of DFA. While this crate constructs dense DFAs by

    /// default, it is easy enough to build corresponding sparse DFAs, and then

    /// build a regex from them:

///

    /// ```

    /// use regex_automata::dfa::regex::Regex;

///

    /// // First, build a regex that uses dense DFAs.

    /// let dense_re = Regex::new("foo[0-9]+")?;

///

    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.

    /// let fwd = dense_re.forward().to_sparse()?;

    /// let rev = dense_re.reverse().to_sparse()?;

///

    /// // Third, build a new regex from the constituent sparse DFAs.

    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);

///

    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.

    /// assert_eq!(true, sparse_re.is_match(b"foo123"));

///

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

///

    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA

    /// more succinctly. (Note though that dense DFAs are still constructed

    /// first internally, and then converted to sparse DFAs, as in the example

    /// above.)

///

    /// ```

    /// use regex_automata::dfa::regex::Regex;

///

    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;

    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.

    /// assert!(sparse_re.is_match(b"foo123"));

///

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

///

    /// # Fallibility

///

    /// Most of the search routines defined on this type will _panic_ when the

    /// underlying search fails. This might be because the DFA gave up because

    /// it saw a quit byte, whether configured explicitly or via heuristic

    /// Unicode word boundary support, although neither are enabled by default.

    /// Or it might fail because an invalid `Input` configuration is given,

    /// for example, with an unsupported [`Anchored`] mode.

///

    /// If you need to handle these error cases instead of allowing them to

    /// trigger a panic, then the lower level [`Regex::try_search`] provides

    /// a fallible API that never panics.

///

    /// # Example

///

    /// This example shows how to cause a search to terminate if it sees a

    /// `\n` byte, and handle the error returned. This could be useful if, for

    /// example, you wanted to prevent a user supplied pattern from matching

    /// across a line boundary.

///

    /// ```

    /// # if cfg!(miri) { return Ok(()); } // miri takes too long

    /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};

///

    /// let re = Regex::builder()

    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))

    ///     .build(r"foo\p{any}+bar")?;

///

    /// let input = Input::new("foo\nbar");

    /// // Normally this would produce a match, since \p{any} contains '\n'.

    /// // But since we instructed the automaton to enter a quit state if a

    /// // '\n' is observed, this produces a match error instead.

    /// let expected = MatchError::quit(b'\n', 3);

    /// let got = re.try_search(&input).unwrap_err();

    /// assert_eq!(expected, got);

///

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[derive(Clone, Debug)]

);

#[cfg(all(feature = "syntax", feature = "dfa-build"))]

impl Regex {

    /// Parse the given regular expression using the default configuration and

    /// return the corresponding regex.

///

    /// If you want a non-default configuration, then use the [`Builder`] to

    /// set your own configuration.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{Match, dfa::regex::Regex};

///

    /// let re = Regex::new("foo[0-9]+bar")?;

    /// assert_eq!(

    ///     Some(Match::must(0, 3..14)),

    ///     re.find(b"zzzfoo12345barzzz"),

    /// );

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn new(pattern: &str) -> Result<Regex, BuildError> {

        Builder::new().build(pattern)

    /// Like `new`, but parses multiple patterns into a single "regex set."

    /// This similarly uses the default regex configuration.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{Match, dfa::regex::Regex};

///

    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;

///

    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");

    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());

    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());

    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());

    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());

    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());

    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());

    /// assert_eq!(None, it.next());

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn new_many<P: AsRef<str>>(

        patterns: &[P],

    ) -> Result<Regex, BuildError> {

        Builder::new().build_many(patterns)

#[cfg(all(feature = "syntax", feature = "dfa-build"))]

impl Regex<sparse::DFA<Vec<u8>>> {

    /// Parse the given regular expression using the default configuration,

    /// except using sparse DFAs, and return the corresponding regex.

///

    /// If you want a non-default configuration, then use the [`Builder`] to

    /// set your own configuration.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{Match, dfa::regex::Regex};

///

    /// let re = Regex::new_sparse("foo[0-9]+bar")?;

    /// assert_eq!(

    ///     Some(Match::must(0, 3..14)),

    ///     re.find(b"zzzfoo12345barzzz"),

    /// );

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn new_sparse(

        pattern: &str,

    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {

        Builder::new().build_sparse(pattern)

    /// Like `new`, but parses multiple patterns into a single "regex set"

    /// using sparse DFAs. This otherwise similarly uses the default regex

    /// configuration.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{Match, dfa::regex::Regex};

///

    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;

///

    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");

    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());

    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());

    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());

    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());

    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());

    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());

    /// assert_eq!(None, it.next());

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn new_many_sparse<P: AsRef<str>>(

        patterns: &[P],

    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {

        Builder::new().build_many_sparse(patterns)

/// Convenience routines for regex construction.

impl Regex<dense::DFA<&'static [u32]>> {

    /// Return a builder for configuring the construction of a `Regex`.

///

    /// This is a convenience routine to avoid needing to import the

    /// [`Builder`] type in common cases.

///

    /// # Example

///

    /// This example shows how to use the builder to disable UTF-8 mode

    /// everywhere.

///

    /// ```

    /// # if cfg!(miri) { return Ok(()); } // miri takes too long

    /// use regex_automata::{

    ///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,

    /// };

///

    /// let re = Regex::builder()

    ///     .syntax(syntax::Config::new().utf8(false))

    ///     .thompson(thompson::Config::new().utf8(false))

    ///     .build(r"foo(?-u:[^b])ar.*")?;

    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";

    /// let expected = Some(Match::must(0, 1..9));

    /// let got = re.find(haystack);

    /// assert_eq!(expected, got);

///

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn builder() -> Builder {

        Builder::new()

/// Standard search routines for finding and iterating over matches.

impl<A: Automaton> Regex<A> {

    /// Returns true if and only if this regex matches the given haystack.

///

    /// This routine may short circuit if it knows that scanning future input

    /// will never lead to a different result. In particular, if the underlying

    /// DFA enters a match state or a dead state, then this routine will return

    /// `true` or `false`, respectively, without inspecting any future input.

///

    /// # Panics

///

    /// This routine panics if the search could not complete. This can occur

    /// in a number of circumstances:

///

    /// * The configuration of the DFA may permit it to "quit" the search.

    /// For example, setting quit bytes or enabling heuristic support for

    /// Unicode word boundaries. The default configuration does not enable any

    /// option that could result in the DFA quitting.

    /// * When the provided `Input` configuration is not supported. For

    /// example, by providing an unsupported anchor mode.

///

    /// When a search panics, callers cannot know whether a match exists or

    /// not.

///

    /// Use [`Regex::try_search`] if you want to handle these error conditions.

///

    /// # Example

///

    /// ```

    /// use regex_automata::dfa::regex::Regex;

///

    /// let re = Regex::new("foo[0-9]+bar")?;

    /// assert_eq!(true, re.is_match("foo12345bar"));

    /// assert_eq!(false, re.is_match("foobar"));

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[inline]

    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {

        // Not only can we do an "earliest" search, but we can avoid doing a

        // reverse scan too.

        let input = input.into().earliest(true);

        self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()

    /// Returns the start and end offset of the leftmost match. If no match

    /// exists, then `None` is returned.

///

    /// # Panics

///

    /// This routine panics if the search could not complete. This can occur

    /// in a number of circumstances:

///

    /// * The configuration of the DFA may permit it to "quit" the search.

    /// For example, setting quit bytes or enabling heuristic support for

    /// Unicode word boundaries. The default configuration does not enable any

    /// option that could result in the DFA quitting.

    /// * When the provided `Input` configuration is not supported. For

    /// example, by providing an unsupported anchor mode.

///

    /// When a search panics, callers cannot know whether a match exists or

    /// not.

///

    /// Use [`Regex::try_search`] if you want to handle these error conditions.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{Match, dfa::regex::Regex};

///

    /// // Greediness is applied appropriately.

    /// let re = Regex::new("foo[0-9]+")?;

    /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));

///

    /// // Even though a match is found after reading the first byte (`a`),

    /// // the default leftmost-first match semantics demand that we find the

    /// // earliest match that prefers earlier parts of the pattern over latter

    /// // parts.

    /// let re = Regex::new("abc|a")?;

    /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[inline]

    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {

        self.try_search(&input.into()).unwrap()

    /// Returns an iterator over all non-overlapping leftmost matches in the

    /// given bytes. If no match exists, then the iterator yields no elements.

///

    /// This corresponds to the "standard" regex search iterator.

///

    /// # Panics

///

    /// If the search returns an error during iteration, then iteration

    /// panics. See [`Regex::find`] for the panic conditions.

///

    /// Use [`Regex::try_search`] with

    /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to

    /// handle these error conditions.

///

    /// # Example

///

    /// ```

    /// use regex_automata::{Match, dfa::regex::Regex};

///

    /// let re = Regex::new("foo[0-9]+")?;

    /// let text = "foo1 foo12 foo123";

    /// let matches: Vec<Match> = re.find_iter(text).collect();

    /// assert_eq!(matches, vec![

    ///     Match::must(0, 0..4),

    ///     Match::must(0, 5..10),

    ///     Match::must(0, 11..17),

    /// ]);

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    #[inline]

    pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(

        &'r self,

        input: I,

    ) -> FindMatches<'r, 'h, A> {

        let it = iter::Searcher::new(input.into());

        FindMatches { re: self, it }

/// Lower level fallible search routines that permit controlling where the

/// search starts and ends in a particular sequence.

impl<A: Automaton> Regex<A> {

    /// Returns the start and end offset of the leftmost match. If no match

    /// exists, then `None` is returned.

///

    /// This is like [`Regex::find`] but with two differences:

///

    /// 1. It is not generic over `Into<Input>` and instead accepts a

    /// `&Input`. This permits reusing the same `Input` for multiple searches

    /// without needing to create a new one. This _may_ help with latency.

    /// 2. It returns an error if the search could not complete where as

    /// [`Regex::find`] will panic.

///

    /// # Errors

///

    /// This routine errors if the search could not complete. This can occur

    /// in the following circumstances:

///

    /// * The configuration of the DFA may permit it to "quit" the search.

    /// For example, setting quit bytes or enabling heuristic support for

    /// Unicode word boundaries. The default configuration does not enable any

    /// option that could result in the DFA quitting.

    /// * When the provided `Input` configuration is not supported. For

    /// example, by providing an unsupported anchor mode.

///

    /// When a search returns an error, callers cannot know whether a match

    /// exists or not.

    #[inline]

    pub fn try_search(

        &self,

        input: &Input<'_>,

    ) -> Result<Option<Match>, MatchError> {

        let (fwd, rev) = (self.forward(), self.reverse());

        let end = match fwd.try_search_fwd(input)? {

            None => return Ok(None),

            Some(end) => end,

};

        // This special cases an empty match at the beginning of the search. If

        // our end matches our start, then since a reverse DFA can't match past

        // the start, it must follow that our starting position is also our end

        // position. So short circuit and skip the reverse search.

        if input.start() == end.offset() {

            return Ok(Some(Match::new(

                end.pattern(),

                end.offset()..end.offset(),

            )));

        // We can also skip the reverse search if we know our search was

        // anchored. This occurs either when the input config is anchored or

        // when we know the regex itself is anchored. In this case, we know the

        // start of the match, if one is found, must be the start of the

        // search.

        if self.is_anchored(input) {

            return Ok(Some(Match::new(

                end.pattern(),

                input.start()..end.offset(),

            )));

        // N.B. I have tentatively convinced myself that it isn't necessary

        // to specify the specific pattern for the reverse search since the

        // reverse search will always find the same pattern to match as the

        // forward search. But I lack a rigorous proof. Why not just provide

        // the pattern anyway? Well, if it is needed, then leaving it out

        // gives us a chance to find a witness. (Also, if we don't need to

        // specify the pattern, then we don't need to build the reverse DFA

        // with 'starts_for_each_pattern' enabled.)

//

        // We also need to be careful to disable 'earliest' for the reverse

        // search, since it could be enabled for the forward search. In the

        // reverse case, to satisfy "leftmost" criteria, we need to match

        // as much as we can. We also need to be careful to make the search

        // anchored. We don't want the reverse search to report any matches

        // other than the one beginning at the end of our forward search.

        let revsearch = input

            .clone()

            .span(input.start()..end.offset())

            .anchored(Anchored::Yes)

            .earliest(false);

        let start = rev

            .try_search_rev(&revsearch)?

            .expect("reverse search must match if forward search does");

        assert_eq!(

            start.pattern(),

            end.pattern(),

            "forward and reverse search must match same pattern",

);

        assert!(start.offset() <= end.offset());

        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))

    /// Returns true if either the given input specifies an anchored search

    /// or if the underlying DFA is always anchored.

    fn is_anchored(&self, input: &Input<'_>) -> bool {

        match input.get_anchored() {

            Anchored::No => self.forward().is_always_start_anchored(),

            Anchored::Yes | Anchored::Pattern(_) => true,

/// Non-search APIs for querying information about the regex and setting a

/// prefilter.

impl<A: Automaton> Regex<A> {

    /// Return the underlying DFA responsible for forward matching.

///

    /// This is useful for accessing the underlying DFA and converting it to

    /// some other format or size. See the [`Builder::build_from_dfas`] docs

    /// for an example of where this might be useful.

    pub fn forward(&self) -> &A {

        &self.forward

    /// Return the underlying DFA responsible for reverse matching.

///

    /// This is useful for accessing the underlying DFA and converting it to

    /// some other format or size. See the [`Builder::build_from_dfas`] docs

    /// for an example of where this might be useful.

    pub fn reverse(&self) -> &A {

        &self.reverse

    /// Returns the total number of patterns matched by this regex.

///

    /// # Example

///

    /// ```

    /// # if cfg!(miri) { return Ok(()); } // miri takes too long

    /// use regex_automata::dfa::regex::Regex;

///

    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;

    /// assert_eq!(3, re.pattern_len());

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn pattern_len(&self) -> usize {

        assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());

        self.forward().pattern_len()

/// An iterator over all non-overlapping matches for an infallible search.

///

/// The iterator yields a [`Match`] value until no more matches could be found.

/// If the underlying regex engine returns an error, then a panic occurs.

///

/// The type parameters are as follows:

///

/// * `A` represents the type of the underlying DFA that implements the

/// [`Automaton`] trait.

///

/// The lifetime parameters are as follows:

///

/// * `'h` represents the lifetime of the haystack being searched.

/// * `'r` represents the lifetime of the regex object itself.

///

/// This iterator can be created with the [`Regex::find_iter`] method.

#[derive(Debug)]

pub struct FindMatches<'r, 'h, A> {

    re: &'r Regex<A>,

    it: iter::Searcher<'h>,

impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {

    type Item = Match;

    #[inline]

    fn next(&mut self) -> Option<Match> {

        let FindMatches { re, ref mut it } = *self;

        it.advance(|input| re.try_search(input))

/// A builder for a regex based on deterministic finite automatons.

///

/// This builder permits configuring options for the syntax of a pattern, the

/// NFA construction, the DFA construction and finally the regex searching

/// itself. This builder is different from a general purpose regex builder in

/// that it permits fine grain configuration of the construction process. The

/// trade off for this is complexity, and the possibility of setting a

/// configuration that might not make sense. For example, there are two

/// different UTF-8 modes:

///

/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls

/// whether the pattern itself can contain sub-expressions that match invalid

/// UTF-8.

/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls

/// how the regex iterators themselves advance the starting position of the

/// next search when a match with zero length is found.

///

/// Generally speaking, callers will want to either enable all of these or

/// disable all of these.

///

/// Internally, building a regex requires building two DFAs, where one is

/// responsible for finding the end of a match and the other is responsible

/// for finding the start of a match. If you only need to detect whether

/// something matched, or only the end of a match, then you should use a

/// [`dense::Builder`] to construct a single DFA, which is cheaper than

/// building two DFAs.

///

/// # Build methods

///

/// This builder has a few "build" methods. In general, it's the result of

/// combining the following parameters:

///

/// * Building one or many regexes.

/// * Building a regex with dense or sparse DFAs.

///

/// The simplest "build" method is [`Builder::build`]. It accepts a single

/// pattern and builds a dense DFA using `usize` for the state identifier

/// representation.

///

/// The most general "build" method is [`Builder::build_many`], which permits

/// building a regex that searches for multiple patterns simultaneously while

/// using a specific state identifier representation.

///

/// The most flexible "build" method, but hardest to use, is

/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is

/// just a pair of DFAs, and this method allows you to specify those DFAs

/// exactly.

///

/// # Example

///

/// This example shows how to disable UTF-8 mode in the syntax and the regex

/// itself. This is generally what you want for matching on arbitrary bytes.

///

/// ```

/// # if cfg!(miri) { return Ok(()); } // miri takes too long

/// use regex_automata::{

///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,

/// };

///

/// let re = Regex::builder()

///     .syntax(syntax::Config::new().utf8(false))

///     .thompson(thompson::Config::new().utf8(false))

///     .build(r"foo(?-u:[^b])ar.*")?;

/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";

/// let expected = Some(Match::must(0, 1..9));

/// let got = re.find(haystack);

/// assert_eq!(expected, got);

/// // Notice that `(?-u:[^b])` matches invalid UTF-8,

/// // but the subsequent `.*` does not! Disabling UTF-8

/// // on the syntax permits this.

/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);

///

/// # Ok::<(), Box<dyn std::error::Error>>(())

/// ```

#[derive(Clone, Debug)]

pub struct Builder {

    #[cfg(feature = "dfa-build")]

    dfa: dense::Builder,

impl Builder {

    /// Create a new regex builder with the default configuration.

    pub fn new() -> Builder {

        Builder {

            #[cfg(feature = "dfa-build")]

            dfa: dense::Builder::new(),

    /// Build a regex from the given pattern.

///

    /// If there was a problem parsing or compiling the pattern, then an error

    /// is returned.

    #[cfg(all(feature = "syntax", feature = "dfa-build"))]

    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {

        self.build_many(&[pattern])

    /// Build a regex from the given pattern using sparse DFAs.

///

    /// If there was a problem parsing or compiling the pattern, then an error

    /// is returned.

    #[cfg(all(feature = "syntax", feature = "dfa-build"))]

    pub fn build_sparse(

        &self,

        pattern: &str,

    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {

        self.build_many_sparse(&[pattern])

    /// Build a regex from the given patterns.

    #[cfg(all(feature = "syntax", feature = "dfa-build"))]

    pub fn build_many<P: AsRef<str>>(

        &self,

        patterns: &[P],

    ) -> Result<Regex, BuildError> {

        let forward = self.dfa.build_many(patterns)?;

        let reverse = self

            .dfa

            .clone()

            .configure(

                dense::Config::new()

                    .prefilter(None)

                    .specialize_start_states(false)

                    .start_kind(StartKind::Anchored)

                    .match_kind(MatchKind::All),

            .thompson(crate::nfa::thompson::Config::new().reverse(true))

            .build_many(patterns)?;

        Ok(self.build_from_dfas(forward, reverse))

    /// Build a sparse regex from the given patterns.

    #[cfg(all(feature = "syntax", feature = "dfa-build"))]

    pub fn build_many_sparse<P: AsRef<str>>(

        &self,

        patterns: &[P],

    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {

        let re = self.build_many(patterns)?;

        let forward = re.forward().to_sparse()?;

        let reverse = re.reverse().to_sparse()?;

        Ok(self.build_from_dfas(forward, reverse))

    /// Build a regex from its component forward and reverse DFAs.

///

    /// This is useful when deserializing a regex from some arbitrary

    /// memory region. This is also useful for building regexes from other

    /// types of DFAs.

///

    /// If you're building the DFAs from scratch instead of building new DFAs

    /// from other DFAs, then you'll need to make sure that the reverse DFA is

    /// configured correctly to match the intended semantics. Namely:

///

    /// * It should be anchored.

    /// * It should use [`MatchKind::All`] semantics.

    /// * It should match in reverse.

    /// * Otherwise, its configuration should match the forward DFA.

///

    /// If these conditions aren't satisfied, then the behavior of searches is

    /// unspecified.

///

    /// Note that when using this constructor, no configuration is applied.

    /// Since this routine provides the DFAs to the builder, there is no

    /// opportunity to apply other configuration options.

///

    /// # Example

///

    /// This example is a bit a contrived. The usual use of these methods

    /// would involve serializing `initial_re` somewhere and then deserializing

    /// it later to build a regex. But in this case, we do everything in

    /// memory.

///

    /// ```

    /// use regex_automata::dfa::regex::Regex;

///

    /// let initial_re = Regex::new("foo[0-9]+")?;

    /// assert_eq!(true, initial_re.is_match(b"foo123"));

///

    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());

    /// let re = Regex::builder().build_from_dfas(fwd, rev);

    /// assert_eq!(true, re.is_match(b"foo123"));

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

///

    /// This example shows how to build a `Regex` that uses sparse DFAs instead

    /// of dense DFAs without using one of the convenience `build_sparse`

    /// routines:

///

    /// ```

    /// use regex_automata::dfa::regex::Regex;

///

    /// let initial_re = Regex::new("foo[0-9]+")?;

    /// assert_eq!(true, initial_re.is_match(b"foo123"));

///

    /// let fwd = initial_re.forward().to_sparse()?;

    /// let rev = initial_re.reverse().to_sparse()?;

    /// let re = Regex::builder().build_from_dfas(fwd, rev);

    /// assert_eq!(true, re.is_match(b"foo123"));

    /// # Ok::<(), Box<dyn std::error::Error>>(())

    /// ```

    pub fn build_from_dfas<A: Automaton>(

        &self,

        forward: A,

        reverse: A,

    ) -> Regex<A> {

        Regex { forward, reverse }

    /// Set the syntax configuration for this builder using

    /// [`syntax::Config`](crate::util::syntax::Config).

///

    /// This permits setting things like case insensitivity, Unicode and multi

    /// line mode.

    #[cfg(all(feature = "syntax", feature = "dfa-build"))]

    pub fn syntax(

        &mut self,

        config: crate::util::syntax::Config,

    ) -> &mut Builder {

        self.dfa.syntax(config);

        self

    /// Set the Thompson NFA configuration for this builder using

    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).

///

    /// This permits setting things like whether additional time should be

    /// spent shrinking the size of the NFA.

    #[cfg(all(feature = "syntax", feature = "dfa-build"))]

    pub fn thompson(

        &mut self,

        config: crate::nfa::thompson::Config,

    ) -> &mut Builder {

        self.dfa.thompson(config);

        self

    /// Set the dense DFA compilation configuration for this builder using

    /// [`dense::Config`](dense::Config).

///

    /// This permits setting things like whether the underlying DFAs should

    /// be minimized.

    #[cfg(feature = "dfa-build")]

    pub fn dense(&mut self, config: dense::Config) -> &mut Builder {

        self.dfa.configure(config);

        self

impl Default for Builder {

    fn default() -> Builder {

        Builder::new()

Source code

Revision control

Copy as Markdown

Other Tools