match_impl.rs - mozsearch

mozilla-mobile/application-services/components/places/src/match_impl.rs

Enable keyboard shortcuts

Revision control

Copy as Markdown

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use crate::util;

use bitflags::bitflags;

use caseless::Caseless;

use rusqlite::{

    self,

    types::{FromSql, FromSqlError, FromSqlResult, ToSql, ToSqlOutput, ValueRef},

};

use std::borrow::Cow;

const MAX_CHARS_TO_SEARCH_THROUGH: usize = 255;

#[derive(Clone, Copy, PartialEq, Eq, Debug)]

#[repr(u32)]

pub enum MatchBehavior {

    // Match anywhere in each searchable tearm

    Anywhere = 0,

    /// Match first on word boundaries, and if we do not get enough results, then

    /// match anywhere in each searchable term.

    BoundaryAnywhere = 1,

    /// Match on word boundaries in each searchable term.

    Boundary = 2,

    /// Match only the beginning of each search term.

    Beginning = 3,

    /// Match anywhere in each searchable term without doing any transformation

    /// or stripping on the underlying data.

    AnywhereUnmodified = 4,

    /// Match only the beginning of each search term using a case sensitive

    /// comparator

    BeginningCaseSensitive = 5,

impl FromSql for MatchBehavior {

    #[inline]

    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {

        Ok(match value.as_i64()? {

            0 => MatchBehavior::Anywhere,

            1 => MatchBehavior::BoundaryAnywhere,

            2 => MatchBehavior::Boundary,

            3 => MatchBehavior::Beginning,

            4 => MatchBehavior::AnywhereUnmodified,

            5 => MatchBehavior::BeginningCaseSensitive,

            _ => return Err(FromSqlError::InvalidType),

})

impl ToSql for MatchBehavior {

    #[inline]

    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {

        Ok(ToSqlOutput::from(*self as u32))

bitflags! {

    pub struct SearchBehavior: u32 {

        /// Search through history.

        const HISTORY = 1;

        /// Search through bookmarks.

        const BOOKMARK = 1 << 1;

        /// Search through tags.

        const TAG = 1 << 2;

        /// Search through the title of pages.

        const TITLE = 1 << 3;

        /// Search the URL of pages.

        const URL = 1 << 4;

        /// Search for typed pages

        const TYPED = 1 << 5;

        /// Search for javascript: urls

        const JAVASCRIPT = 1 << 6;

        /// Search for open pages (currently not meaningfully implemented)

        const OPENPAGE = 1 << 7;

        /// Use intersection between history, typed, bookmark, tag and openpage

        /// instead of union, when the restrict bit is set.

        const RESTRICT = 1 << 8;

        /// Include search suggestions from the currently selected search provider

        /// (currently not implemented)

        const SEARCHES = 1 << 9;

impl Default for SearchBehavior {

    // See `defaultBehavior` in Desktop's `UrlbarPrefs.jsm`.

    fn default() -> SearchBehavior {

        SearchBehavior::HISTORY

            | SearchBehavior::BOOKMARK

            | SearchBehavior::OPENPAGE

            | SearchBehavior::SEARCHES

impl SearchBehavior {

    #[inline]

    pub fn any() -> Self {

        SearchBehavior::all() & !SearchBehavior::RESTRICT

impl FromSql for SearchBehavior {

    #[inline]

    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {

        SearchBehavior::from_bits(u32::column_result(value)?).ok_or(FromSqlError::InvalidType)

impl ToSql for SearchBehavior {

    #[inline]

    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {

        Ok(ToSqlOutput::from(self.bits()))

/// Convert `c` to lower case if it's an alphabetic ascii character, or completely mangle it if it's

/// not. Just returns `c | 0x20`. I don't know if I actually believe this is faster in a way that

/// matters than the saner version.

#[inline(always)]

fn dubious_to_ascii_lower(c: u8) -> u8 {

    c | 0x20

/// A port of nextSearchCandidate in the desktop places's SQLFunctions.cpp:

///

/// > Scan forward through UTF-8 text until the next potential character that

/// > could match a given codepoint when lower-cased (false positives are okay).

/// > This avoids having to actually parse the UTF-8 text, which is slow.

///

/// It returns the byte index of the first character that could possibly match.

#[inline(always)]

fn next_search_candidate(to_search: &str, search_for: char) -> Option<usize> {

    // If the character we search for is ASCII, then we can scan until we find

    // it or its ASCII uppercase character, modulo the special cases

    // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+212A KELVIN SIGN

    // (which are the only non-ASCII characters that lower-case to ASCII ones).

    // Since false positives are okay, we approximate ASCII lower-casing by

    // bit-ORing with 0x20, for increased performance.

//

    // If the character we search for is *not* ASCII, we can ignore everything

    // that is, since all ASCII characters lower-case to ASCII.

//

    // Because of how UTF-8 uses high-order bits, this will never land us

    // in the middle of a codepoint.

//

    // The assumptions about Unicode made here are verified in test_casing.

    let search_bytes = to_search.as_bytes();

    if (search_for as u32) < 128 {

        // When searching for I or K, we pick out the first byte of the UTF-8

        // encoding of the corresponding special case character, and look for it

        // in the loop below.  For other characters we fall back to 0xff, which

        // is not a valid UTF-8 byte.

        let target = dubious_to_ascii_lower(search_for as u8);

        let special = if target == b'i' {

            0xc4u8

        } else if target == b'k' {

            0xe2u8

        } else {

            0xffu8

};

        // Note: rustc doesn't do great at all on the more idiomatic

        // implementation of this (or below), but it does okay for this.

        let mut ci = 0;

        while ci < search_bytes.len() {

            let cur = search_bytes[ci];

            if dubious_to_ascii_lower(cur) == target || cur == special {

                return Some(ci);

            ci += 1;

    } else {

        let mut ci = 0;

        while ci < search_bytes.len() {

            let cur = search_bytes[ci];

            if (cur & 0x80) != 0 {

                return Some(ci);

            ci += 1;

    None

#[inline(always)]

fn is_ascii_lower_alpha(c: u8) -> bool {

    // Equivalent to (but fewer operations than) `b'a' <= c && c <= b'z'`

    c.wrapping_sub(b'a') <= (b'z' - b'a')

/// port of isOnBoundary from gecko places.

///

/// > Check whether a character position is on a word boundary of a UTF-8 string

/// > (rather than within a word).  We define "within word" to be any position

/// > between [a-zA-Z] and [a-z] -- this lets us match CamelCase words.

/// > TODO: support non-latin alphabets.

#[inline(always)]

fn is_on_boundary(text: &str, index: usize) -> bool {

    if index == 0 {

        return true;

    let bytes = text.as_bytes();

    if is_ascii_lower_alpha(bytes[index]) {

        let prev_lower = dubious_to_ascii_lower(bytes[index - 1]);

        !is_ascii_lower_alpha(prev_lower)

    } else {

        true

/// Returns true if `source` starts with `token` ignoring case.

///

/// Loose port of stringMatch from places, which we've modified to perform more correct case

/// folding (if this turns out to be a perf issue we can always address it then).

#[inline]

fn string_match(token: &str, source: &str) -> bool {

    if source.len() < token.len() {

        return false;

    let mut ti = token.chars().default_case_fold();

    let mut si = source.chars().default_case_fold();

    loop {

        match (ti.next(), si.next()) {

            (None, _) => return true,

            (Some(_), None) => return false,

            (Some(x), Some(y)) => {

                if x != y {

                    return false;

/// This performs single-codepoint case folding. It will do the wrong thing

/// for characters which have lowercase equivalents with multiple characters.

#[inline]

fn char_to_lower_single(c: char) -> char {

    c.to_lowercase().next().unwrap()

/// Read the next codepoint out of `s` and return it's lowercase variant, and the index of the

/// codepoint after it.

#[inline]

fn next_codepoint_lower(s: &str) -> (char, usize) {

    // This is super convoluted, and I wish a more direct way to do it was exposed. (In theory

    // this should be more efficient than this implementation is)

    let mut indices = s.char_indices();

    let (_, next_char) = indices.next().unwrap();

    let next_index = indices

        .next()

        .map(|(index, _)| index)

        .unwrap_or_else(|| s.len());

    (char_to_lower_single(next_char), next_index)

// Port of places `findInString`.

pub fn find_in_string(token: &str, src: &str, only_boundary: bool) -> bool {

    // Place's version has this restriction too

    assert!(!token.is_empty(), "Don't search for an empty string");

    if src.len() < token.len() {

        return false;

    let token_first_char = next_codepoint_lower(token).0;

    // The C++ code is a big ol pointer party, and even indexes with negative numbers

    // in some places. We aren't quite this depraved, so we just use indices into slices.

//

    // There's probably a higher cost to this than usual, and if we had more robust testing

    // (fuzzing, even) it might be worth measuring a version of this that avoids more of the

    // bounds checks.

    let mut cur_offset = 0;

    // Scan forward to the next viable candidate (if any).

    while let Some(src_idx) = next_search_candidate(&src[cur_offset..], token_first_char) {

        if cur_offset + src_idx >= src.len() {

            break;

        cur_offset += src_idx;

        let src_cur = &src[cur_offset..];

        // Check whether the first character in the token matches the character

        // at src_cur. At the same time, get the index of the next character

        // in the source.

        let (src_next_char, next_offset_in_cur) = next_codepoint_lower(src_cur);

        // If it is the first character, and we either don't care about boundaries or

        // we're on one, do the more expensive string matching and return true if it hits.

        if src_next_char == token_first_char

            && (!only_boundary || is_on_boundary(src, cur_offset))

            && string_match(token, src_cur)

            return true;

        cur_offset += next_offset_in_cur;

    false

// Search functions used as function pointers by AutocompleteMatch::Invoke

fn find_anywhere(token: &str, source: &str) -> bool {

    assert!(!token.is_empty(), "Don't search for an empty token");

    find_in_string(token, source, false)

fn find_on_boundary(token: &str, source: &str) -> bool {

    assert!(!token.is_empty(), "Don't search for an empty token");

    find_in_string(token, source, true)

fn find_beginning(token: &str, source: &str) -> bool {

    assert!(!token.is_empty(), "Don't search for an empty token");

    string_match(token, source)

fn find_beginning_case_sensitive(token: &str, source: &str) -> bool {

    assert!(!token.is_empty(), "Don't search for an empty token");

    source.starts_with(token)

// I can't wait for Rust 2018 when lifetime annotations are automatic.

pub struct AutocompleteMatch<'search, 'url, 'title, 'tags> {

    pub search_str: &'search str,

    pub url_str: &'url str,

    pub title_str: &'title str,

    pub tags: &'tags str,

    pub visit_count: u32,

    pub typed: bool,

    pub bookmarked: bool,

    pub open_page_count: u32,

    pub match_behavior: MatchBehavior,

    pub search_behavior: SearchBehavior,

impl<'search, 'url, 'title, 'tags> AutocompleteMatch<'search, 'url, 'title, 'tags> {

    fn get_search_fn(&self) -> fn(&str, &str) -> bool {

        match self.match_behavior {

            MatchBehavior::Anywhere | MatchBehavior::AnywhereUnmodified => find_anywhere,

            MatchBehavior::Beginning => find_beginning,

            MatchBehavior::BeginningCaseSensitive => find_beginning_case_sensitive,

            _ => find_on_boundary,

    fn fixup_url_str<'a>(&self, mut s: &'a str) -> Cow<'a, str> {

        if self.match_behavior != MatchBehavior::AnywhereUnmodified {

            if s.starts_with("http://") {

                s = &s[7..];

            } else if s.starts_with("https://") {

                s = &s[8..];

            } else if s.starts_with("ftp://") {

                s = &s[6..];

        // Bail out early if we don't need to percent decode. It's a

        // little weird that it's measurably faster to check this

        // separately, but whatever.

        if memchr::memchr(b'%', s.as_bytes()).is_none() {

            return Cow::Borrowed(s);

        // TODO: would be nice to decode punycode here too, but for now

        // this is probably fine.

        match percent_encoding::percent_decode(s.as_bytes()).decode_utf8() {

            Err(_) => Cow::Borrowed(s),

            Ok(decoded) => decoded,

    #[inline]

    fn has_behavior(&self, behavior: SearchBehavior) -> bool {

        self.search_behavior.intersects(behavior)

    pub fn invoke(&self) -> bool {

        // We only want to filter javascript: URLs if we are not supposed to search

        // for them, and the search does not start with "javascript:".

        if self.match_behavior == MatchBehavior::AnywhereUnmodified

            && self.url_str.starts_with("javascript:")

            && !self.has_behavior(SearchBehavior::JAVASCRIPT)

            && !self.search_str.starts_with("javascript:")

            return false;

        let matches = if self.has_behavior(SearchBehavior::RESTRICT) {

            (!self.has_behavior(SearchBehavior::HISTORY) || self.visit_count > 0)

                && (!self.has_behavior(SearchBehavior::TYPED) || self.typed)

                && (!self.has_behavior(SearchBehavior::BOOKMARK) || self.bookmarked)

                && (!self.has_behavior(SearchBehavior::TAG) || !self.tags.is_empty())

                && (!self.has_behavior(SearchBehavior::OPENPAGE) || self.open_page_count > 0)

        } else {

            (self.has_behavior(SearchBehavior::HISTORY) && self.visit_count > 0)

                || (self.has_behavior(SearchBehavior::TYPED) && self.typed)

                || (self.has_behavior(SearchBehavior::BOOKMARK) && self.bookmarked)

                || (self.has_behavior(SearchBehavior::TAG) && !self.tags.is_empty())

                || (self.has_behavior(SearchBehavior::OPENPAGE) && self.open_page_count > 0)

};

        if !matches {

            return false;

        let fixed_url = self.fixup_url_str(self.url_str);

        let search_fn = self.get_search_fn();

        let trimmed_url = util::slice_up_to(fixed_url.as_ref(), MAX_CHARS_TO_SEARCH_THROUGH);

        let trimmed_title = util::slice_up_to(self.title_str, MAX_CHARS_TO_SEARCH_THROUGH);

        for token in self.search_str.split_ascii_whitespace() {

            let matches = match (

                self.has_behavior(SearchBehavior::TITLE),

                self.has_behavior(SearchBehavior::URL),

) {

                (true, true) => {

                    (search_fn(token, trimmed_title) || search_fn(token, self.tags))

                        && search_fn(token, trimmed_url)

                (true, false) => search_fn(token, trimmed_title) || search_fn(token, self.tags),

                (false, true) => search_fn(token, trimmed_url),

                (false, false) => {

                    search_fn(token, trimmed_url)

                        || search_fn(token, trimmed_title)

                        || search_fn(token, self.tags)

};

            if !matches {

                return false;

        true

#[cfg(test)]

mod test {

    use super::*;

    #[test]

    fn test_is_ascii_lower_alpha() {

        // just check exhaustively

        for c in 0u8..=255u8 {

            assert_eq!(

                is_ascii_lower_alpha(c),

                c.is_ascii_lowercase(),

                "is_lower_ascii_alpha is wrong for {}",

);

    // Test the various dubious things this code assumes about unicode / ascii text

    // in the name of performance. This is mostly a port of the test_casing gtests in places

    #[test]

    fn test_casing_assumptions() {

        use std::char;

        // Verify the assertion in next_search_candidate that the

        // only non-ASCII characters that lower-case to ASCII ones are:

        //  * U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE

        //  * U+212A KELVIN SIGN

//

        // It also checks that U+0130 is the only single codepoint that lower cases

        // to multiple characters.

        for c in 128..0x11_0000 {

            if let Some(ch) = char::from_u32(c) {

                // Not quite the same (because codepoints aren't characters), but

                // should serve the same purpose.

                let mut li = ch.to_lowercase();

                let lc = li.next().unwrap();

                if c != 304 && c != 8490 {

                    assert!(

                        (lc as u32) >= 128,

                        "Lower case of non-ascii '{}' ({}) was unexpectedly ascii",

ch,

);

                    // This one we added (it's an implicit assumption in the utilities the

                    // places code uses).

                    assert!(

                        li.next().is_none(),

                        "Lower case of '{}' ({}) produced multiple codepoints unexpectedly",

ch,

);

                } else {

                    assert!(

                        (lc as u32) < 128,

                        "Lower case of non-ascii '{}' ({}) was unexpectedly not ascii",

ch,

);

        // Verify the assertion that all ASCII characters lower-case to ASCII.

        for c in 0..128 {

            let ch = char::from_u32(c).unwrap();

            let mut li = ch.to_lowercase();

            let lc = li.next().unwrap();

            assert!(

                li.next().is_none() && (lc as u32) < 128,

                "Lower case of ascii '{}' ({}) wasn't ascii :(",

ch,

);

        for c in (b'a'..=b'z').chain(b'A'..=b'Z') {

            assert_eq!(

                dubious_to_ascii_lower(c),

                c.to_ascii_lowercase(),

                "c: '{}'",

                c as char

);