UrlbarTokenizer.sys.mjs

firefox-main/browser/components/urlbar/UrlbarTokenizer.sys.mjs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Firefox :: Address Bar

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**

 * This module exports a tokenizer to be used by the urlbar model.

 * Emitted tokens are objects in the shape { type, value }, where type is one

 * of UrlbarTokenizer.TYPE.

*/

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {

  UrlbarPrefs: "moz-src:///browser/components/urlbar/UrlbarPrefs.sys.mjs",

  UrlbarUtils: "moz-src:///browser/components/urlbar/UrlbarUtils.sys.mjs",

  PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",

  UrlUtils: "resource://gre/modules/UrlUtils.sys.mjs",

});

// Logger obtained from UrlbarUtils.getLogger() with the "Tokenizer" prefix,
// created on first access.
ChromeUtils.defineLazyGetter(lazy, "logger", function () {
  return lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" });
});

// Localization instance over browser/browser.ftl, created on first access.
ChromeUtils.defineLazyGetter(
  lazy,
  "gFluentStrings",
  () => new Localization(["browser/browser.ftl"])
);

/**

 * @typedef UrlbarSearchStringTokenData

 * @property {Values<typeof lazy.UrlbarTokenizer.TYPE>} type

 *   The type of the token.

 * @property {string} value

 *   The value of the token.

 * @property {string} lowerCaseValue

 *   The lower case version of the value.

*/

/**
 * Cache of restrict keywords, filled by
 * `UrlbarTokenizer.loadL10nRestrictKeywords()`.
 *
 * Each key is a restrict token and each value is an array holding the
 * localized keyword followed by the English keyword, with duplicates removed.
 * For example:
 *   "*" maps to ["Bookmarks"] when the localized keyword equals the English one
 *   "*" maps to ["Marcadores", "Bookmarks"] for es-ES
 *
 * The binding is never reassigned; the Map is only mutated in place.
 *
 * @type {Map<string, string[]>}
 */
const tokenToKeywords = new Map();

export var UrlbarTokenizer = {
  // Numeric types assigned to emitted tokens. The RESTRICT_* values are named
  // after the keys of RESTRICT below (`RESTRICT_${key}`).
  TYPE: {
    TEXT: 1,
    // `looksLikeOrigin()` returned a value for this token that was neither
    // `LOOKS_LIKE_ORIGIN.NONE` nor `LOOKS_LIKE_ORIGIN.OTHER`. It sure looks
    // like an origin.
    POSSIBLE_ORIGIN: 2,
    POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
    RESTRICT_HISTORY: 4,
    RESTRICT_BOOKMARK: 5,
    RESTRICT_TAG: 6,
    RESTRICT_OPENPAGE: 7,
    RESTRICT_SEARCH: 8,
    RESTRICT_TITLE: 9,
    RESTRICT_URL: 10,
    RESTRICT_ACTION: 11,
    // `looksLikeOrigin()` returned `LOOKS_LIKE_ORIGIN.OTHER` for this token. It
    // may or may not be an origin.
    POSSIBLE_ORIGIN_BUT_SEARCH_ALLOWED: 12,
  },

  // The special characters below can be typed into the urlbar to restrict
  // the search to a certain category, like history, bookmarks or open pages; or
  // to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused, for example # could be present
  // in a valid url, and thus it should not be interpreted as a restriction.
  RESTRICT: {
    HISTORY: "^",
    BOOKMARK: "*",
    TAG: "+",
    OPENPAGE: "%",
    SEARCH: "?",
    TITLE: "#",
    URL: "$",
    ACTION: ">",
  },

  /**
   * The characters of RESTRICT that will enter search mode. The action
   * character is included only while the "scotchBonnet.enableOverride" pref
   * is truthy. A new Set is built on every access.
   *
   * @returns {Set<string>}
   */
  get SEARCH_MODE_RESTRICT() {
    const keys = [
      this.RESTRICT.HISTORY,
      this.RESTRICT.BOOKMARK,
      this.RESTRICT.OPENPAGE,
      this.RESTRICT.SEARCH,
    ];
    if (lazy.UrlbarPrefs.get("scotchBonnet.enableOverride")) {
      keys.push(this.RESTRICT.ACTION);
    }
    return new Set(keys);
  },

  /**
   * Fetches the localized and the English keyword for every local search mode
   * and stores them, de-duplicated, in the keyword cache keyed by the mode's
   * restrict token.
   *
   * @returns {Promise<void>}
   */
  async loadL10nRestrictKeywords() {
    const modes = lazy.UrlbarUtils.LOCAL_SEARCH_MODES;
    const sourceNames = modes.map(mode =>
      lazy.UrlbarUtils.getResultSourceName(mode.source)
    );
    const englishSearchStrings = new Localization([
      "preview/enUS-searchFeatures.ftl",
    ]);

    // The two lookups do not depend on each other, so run them concurrently.
    // Results come back in the same order as `modes`.
    const [l10nKeywords, englishKeywords] = await Promise.all([
      lazy.gFluentStrings.formatValues(
        sourceNames.map(name => ({ id: `urlbar-search-mode-${name}` }))
      ),
      englishSearchStrings.formatValues(
        sourceNames.map(name => ({ id: `urlbar-search-mode-${name}-en` }))
      ),
    ]);

    modes.forEach(({ restrict }, index) => {
      // The Set collapses the pair when both keywords are identical.
      const uniqueKeywords = [
        ...new Set([l10nKeywords[index], englishKeywords[index]]),
      ];
      tokenToKeywords.set(restrict, uniqueKeywords);
    });
  },

  /**
   * Gets the cached localized restrict keywords. If keywords are not cached
   * fetch the localized keywords first and then return the keywords.
   *
   * @returns {Promise<Map<string, string[]>>}
   *   The cache itself (not a copy), keyed by restrict token.
   */
  async getL10nRestrictKeywords() {
    if (tokenToKeywords.size === 0) {
      await this.loadL10nRestrictKeywords();
    }
    return tokenToKeywords;
  },

  /**
   * Tokenizes the searchString from a UrlbarQueryContext.
   *
   * @param {object} context
   * @param {string} context.searchString
   * @param {string} [context.searchMode]
   * @param {string} context.trimmedSearchString
   * @returns {UrlbarSearchStringTokenData[]}
   *  The tokens associated with the query. Empty when trimmedSearchString is
   *  empty.
   */
  tokenize(context) {
    lazy.logger.debug("Tokenizing search string", {
      searchString: context.searchString,
    });
    if (!context.trimmedSearchString) {
      return [];
    }
    let unfiltered = splitString(context);
    return filterTokens(unfiltered);
  },

  /**
   * Given a token, tells if it's a restriction token.
   *
   * @param {object} token
   *   The token to check. May be null or undefined.
   * @returns {boolean} Whether the token is a restriction character.
   */
  isRestrictionToken(token) {
    // NOTE(review): the range ends at RESTRICT_URL (10), so RESTRICT_ACTION
    // (11) tokens are not reported as restriction tokens - confirm intended.
    return (
      !!token &&
      token.type >= this.TYPE.RESTRICT_HISTORY &&
      token.type <= this.TYPE.RESTRICT_URL
    );
  },
};

// Maps each restriction character (a value of UrlbarTokenizer.RESTRICT) to
// the UrlbarTokenizer.TYPE value named `RESTRICT_${key}` for the same key.
const CHAR_TO_TYPE_MAP = new Map();
for (const [name, restrictChar] of Object.entries(UrlbarTokenizer.RESTRICT)) {
  CHAR_TO_TYPE_MAP.set(restrictChar, UrlbarTokenizer.TYPE[`RESTRICT_${name}`]);
}

/**
 * Given a queryContext object, splits its searchString into string tokens.
 *
 * @param {object} context
 * @param {string} context.searchString
 * @param {string} [context.searchMode]
 *   When set, a leading restriction character is never split off the first
 *   token.
 * @returns {string[]} An array of string tokens.
 */
function splitString({ searchString, searchMode }) {
  // Only this many leading characters are split on whitespace; anything
  // beyond is appended to the last token.
  const MAX_SPLIT_LENGTH = 500;

  // The first step is splitting on unicode whitespaces. We ignore whitespaces
  // if the search string starts with "data:", to better support Web developers
  // and compatibility with other browsers.
  const trimmed = searchString.trim();
  let tokens;
  if (trimmed.startsWith("data:")) {
    tokens = [trimmed];
  } else if (trimmed.length < MAX_SPLIT_LENGTH) {
    tokens = trimmed.split(lazy.UrlUtils.REGEXP_SPACES);
  } else {
    // If the string is very long, tokenizing all of it would be expensive. So
    // we only tokenize a part of it, then let the last token become a
    // catch-all.
    tokens = trimmed
      .substring(0, MAX_SPLIT_LENGTH)
      .split(lazy.UrlUtils.REGEXP_SPACES);
    tokens[tokens.length - 1] += trimmed.substring(MAX_SPLIT_LENGTH);
  }

  if (!tokens.length) {
    return tokens;
  }

  // If there is no separate restriction token, it's possible we have to split
  // the first token, if it includes a leading restriction char.
  // This allows to not require the user to add artificial whitespaces to
  // enforce restrictions, for example typing "?question" restricts to search
  // results.
  const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
  const firstToken = tokens[0];
  // A first token that is itself a restriction character is never looked up
  // as a keyword.
  const isFirstTokenAKeyword =
    !CHAR_TO_TYPE_MAP.has(firstToken) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(firstToken);
  if (hasRestrictionToken || isFirstTokenAKeyword) {
    return tokens;
  }

  // Check for an unambiguous restriction char at the beginning of the first
  // token. A token starting with a percent-encoded sequence is left alone, as
  // is any token while a search mode is active.
  if (
    CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
    !lazy.UrlUtils.REGEXP_PERCENT_ENCODED_START.test(firstToken) &&
    !searchMode
  ) {
    // Replace the first token with [restriction char, remainder].
    tokens.splice(0, 1, firstToken[0], firstToken.substring(1));
  }
  return tokens;
}

/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * to token objects with a type.
 *
 * @param {string[]} tokens
 *        An array of strings, representing search tokens.
 * @returns {UrlbarSearchStringTokenData[]} An array of token objects.
 *
 * Note: restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. In case of restriction characters
 *       conflict, the most external ones win. Leading ones win over trailing
 *       ones. Discarded restriction characters are considered text.
 */
function filterTokens(tokens) {
  const TYPE = UrlbarTokenizer.TYPE;
  const filtered = [];
  // Token index -> restriction type, for every token that is exactly one
  // restriction character.
  const restrictions = new Map();
  // When the first token is a cached keyword, every token is kept as TEXT.
  const isFirstTokenAKeyword =
    !CHAR_TO_TYPE_MAP.has(tokens[0]) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(tokens[0]);

  for (let i = 0; i < tokens.length; ++i) {
    const token = tokens[i];
    const tokenObj = {
      value: token,
      lowerCaseValue: token.toLocaleLowerCase(),
      type: TYPE.TEXT,
    };

    // For privacy reasons, we don't want to send a data (or other kind of) URI
    // to a search engine. So we want to parse any single long token below.
    // In a multi-token input, a token longer than 500 chars is kept as TEXT
    // and ends the loop; any tokens after it are dropped.
    if (tokens.length > 1 && token.length > 500) {
      filtered.push(tokenObj);
      break;
    }

    if (isFirstTokenAKeyword) {
      filtered.push(tokenObj);
      continue;
    }

    const restrictionType = CHAR_TO_TYPE_MAP.get(token);
    if (restrictionType) {
      // Stays TEXT for now; the type is only applied below if this
      // restriction wins.
      restrictions.set(i, restrictionType);
    } else {
      const looksLikeOrigin = lazy.UrlUtils.looksLikeOrigin(token);
      if (
        looksLikeOrigin === lazy.UrlUtils.LOOKS_LIKE_ORIGIN.OTHER &&
        lazy.UrlbarPrefs.get("allowSearchSuggestionsForSimpleOrigins")
      ) {
        tokenObj.type = TYPE.POSSIBLE_ORIGIN_BUT_SEARCH_ALLOWED;
      } else if (looksLikeOrigin !== lazy.UrlUtils.LOOKS_LIKE_ORIGIN.NONE) {
        tokenObj.type = TYPE.POSSIBLE_ORIGIN;
      } else if (lazy.UrlUtils.looksLikeUrl(token, { requirePath: true })) {
        tokenObj.type = TYPE.POSSIBLE_URL;
      }
    }
    filtered.push(tokenObj);
  }

  // Handle restriction characters.
  if (restrictions.size) {
    // We can apply two kind of restrictions: type (bookmark, search, ...) and
    // matching (url, title). These kind of restrictions can be combined, but we
    // can only have one restriction per kind.
    let matchingRestrictionFound = false;
    let typeRestrictionFound = false;

    // Applies the restriction recorded at `index`, if there is one and its
    // kind has not been taken yet. Returns whether it was applied.
    const assignRestriction = index => {
      const type = restrictions.get(index);
      if (type === undefined) {
        return false;
      }
      const isMatching =
        type === TYPE.RESTRICT_TITLE || type === TYPE.RESTRICT_URL;
      if (isMatching ? matchingRestrictionFound : typeRestrictionFound) {
        return false;
      }
      if (isMatching) {
        matchingRestrictionFound = true;
      } else {
        typeRestrictionFound = true;
      }
      filtered[index].type = type;
      return true;
    };

    // Look at the first token and, if it was assigned, at the next one.
    if (assignRestriction(0)) {
      assignRestriction(1);
    }
    // Then look at the last token and, if it was assigned, at the previous
    // one.
    const lastIndex = tokens.length - 1;
    if (assignRestriction(lastIndex)) {
      assignRestriction(lastIndex - 1);
    }
  }

  lazy.logger.info("Filtered Tokens", filtered);
  return filtered;
}