SearchBrowsingHistoryDomainBoost.sys.mjs

firefox-main/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Machine Learning: General

Revision control

Copy as Markdown

Other Tools

/**

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/.

*/

/**

 * SearchBrowsingHistoryDomainBoost

 * Temporary heuristic for general-category queries (games, movies, news, etc.)

 * when semantic embeddings over title/description are insufficient.

 * Safe to remove once richer embeddings or better intent classification lands.

*/

export const CATEGORIES_JSON = {

  language: "en",

  categories: [

      id: "games",

      terms: [

        "game",

        "games",

        "video game",

        "video games",

        "pc games",

        "console games",

],

      domains: [

        "store.steampowered.com",

        "roblox.com",

        "ign.com",

        "gamespot.com",

        "polygon.com",

        "metacritic.com",

        "epicgames.com",

        "store.playstation.com",

        "xbox.com",

        "nintendo.com",

],

},

      id: "movies",

      terms: ["movie", "movies", "film", "films", "cinema"],

      domains: [

        "imdb.com",

        "rottentomatoes.com",

        "metacritic.com",

        "letterboxd.com",

        "netflix.com",

        "primevideo.com",

        "disneyplus.com",

        "hulu.com",

        "max.com",

],

},

      id: "tv",

      terms: ["tv show", "tv shows", "show", "shows", "series", "tv series"],

      domains: [

        "imdb.com",

        "rottentomatoes.com",

        "metacritic.com",

        "tvmaze.com",

        "thetvdb.com",

        "netflix.com",

        "primevideo.com",

        "disneyplus.com",

        "hulu.com",

        "max.com",

],

},

      id: "books",

      terms: ["book", "books", "novel", "novels"],

      domains: [

        "goodreads.com",

        "gutenberg.org",

        "openlibrary.org",

        "barnesandnoble.com",

        "indigo.ca",

],

},

      id: "anime",

      terms: ["anime", "manga"],

      domains: [

        "myanimelist.net",

        "anilist.co",

        "kitsu.app",

        "crunchyroll.com",

],

},

      id: "music",

      terms: ["music", "song", "songs", "album", "albums", "lyrics"],

      domains: [

        "spotify.com",

        "music.apple.com",

        "soundcloud.com",

        "bandcamp.com",

        "music.youtube.com",

],

},

      id: "podcasts",

      terms: ["podcast", "podcasts"],

      domains: [

        "podcasts.apple.com",

        "overcast.fm",

        "pocketcasts.com",

        "castbox.fm",

],

},

      id: "papers_research",

      terms: [

        "paper",

        "papers",

        "research paper",

        "research papers",

        "academic paper",

        "academic papers",

        "journal",

        "journals",

        "study",

        "studies",

        "publication",

        "publications",

],

      domains: [

        "scholar.google.com",

        "arxiv.org",

        "semanticscholar.org",

        "pubmed.ncbi.nlm.nih.gov",

        "researchgate.net",

        "ieeexplore.ieee.org",

        "dl.acm.org",

        "springer.com",

        "nature.com",

        "science.org",

],

},

      id: "tech_news",

      terms: ["tech news", "technology news", "startup news"],

      domains: [

        "theverge.com",

        "techcrunch.com",

        "wired.com",

        "arstechnica.com",

        "engadget.com",

],

},

      id: "finance_news",

      terms: ["finance news", "business news", "market news", "stock news"],

      domains: [

        "bloomberg.com",

        "wsj.com",

        "ft.com",

        "reuters.com",

        "cnbc.com",

],

},

      id: "news",

      terms: [

        "news",

        "headline",

        "headlines",

        "breaking news",

        "world news",

        "latest news",

],

      domains: [

        "reuters.com",

        "apnews.com",

        "bbc.com",

        "cnn.com",

        "nytimes.com",

        "theguardian.com",

        "washingtonpost.com",

        "aljazeera.com",

        "npr.org",

        "wsj.com",

        "bloomberg.com",

        "ft.com",

],

},

      id: "recipes",

      terms: [

        "recipe",

        "recipes",

        "cooking",

        "food",

        "dinner ideas",

        "meal prep",

],

      domains: [

        "allrecipes.com",

        "seriouseats.com",

        "foodnetwork.com",

        "bbcgoodfood.com",

        "epicurious.com",

        "nytcooking.com",

],

},

      id: "travel",

      terms: ["travel", "hotels", "places", "destinations", "things to do"],

      domains: [

        "tripadvisor.com",

        "booking.com",

        "expedia.com",

        "airbnb.com",

        "lonelyplanet.com",

],

},

],

};

/**

 * Normalizes a query string into a lowercase, space-separated form suitable for matching

 * and comparison.

 * @param {string} s

 * @returns {string}

*/

function normalizeQuery(s) {

  return (s || "")

    .toLowerCase()

    .replace(/[^\p{L}\p{N}]+/gu, " ")

    .replace(/\s+/g, " ")

    .trim();

/**

 * Returns the matched category domains if searchTerm looks like a general category query.

 * Uses phrase matching on normalized query string.

 * @param {string} searchTerm

 * @param {object} [categoriesJson=CATEGORIES_JSON]

 * @returns {string[]|null}

*/

export function matchDomains(searchTerm, categoriesJson = CATEGORIES_JSON) {

  const q = ` ${normalizeQuery(searchTerm)} `;

  if (!q.trim()) {

    return null;

  for (const cat of categoriesJson.categories) {

    for (const t of cat.terms) {

      // Pad with spaces to enable whole-token phrase matching via includes.

      const tt = ` ${normalizeQuery(t)} `;

      if (tt.trim() && q.includes(tt)) {

        return cat.domains;

  return null;

/**

 * Builds a SQL WHERE clause for matching `http`/`https` URLs belonging

 * to the given root domains and their `www` variants.

 * @param {string[]} domains

 * @returns {{ where: string, params: object }}

*/

function buildDomainUrlWhere(domains) {

  const clauses = [];

  const params = {};

  let i = 0;

  for (const raw of domains || []) {

    const d = String(raw).toLowerCase();

    if (!d) {

      continue;

    // - https://domain/...

    // - https://www.domain/...

    params[`d${i}`] = `%://${d}/%`;

    clauses.push(`lower(url) LIKE :d${i++}`);

    params[`d${i}`] = `%://www.${d}/%`;

    clauses.push(`lower(url) LIKE :d${i++}`);

  return {

    where: clauses.length ? `(${clauses.join(" OR ")})` : "0",

    params,

};

/**

 * Domain-filtered moz_places query (time-windowed).

 * @param {object} params

 * @param {object} params.conn

 * @param {string[]} params.domains

 * @param {number|null} params.startTs

 * @param {number|null} params.endTs

 * @param {number} params.historyLimit

 * @param {Function} params.buildHistoryRow

 * @returns {Promise<object[]>}

*/

export async function searchByDomains({

  conn,

  domains,

  startTs,

  endTs,

  historyLimit,

  buildHistoryRow,

}) {

  if (!conn || !Array.isArray(domains) || !domains.length) {

    return [];

  const { where, params } = buildDomainUrlWhere(domains);

  const results = await conn.executeCached(

      SELECT id,

             title,

             url,

             NULL AS distance,

             visit_count,

             frecency,

             last_visit_date,

             preview_image_url

      FROM moz_places

      WHERE frecency <> 0

        AND (:startTs IS NULL OR last_visit_date >= :startTs)

        AND (:endTs IS NULL OR last_visit_date <= :endTs)

        AND ${where}

      ORDER BY last_visit_date DESC, frecency DESC

      LIMIT :limit

`,

      startTs,

      endTs,

      limit: historyLimit,

      ...params,

);

  const rows = [];

  for (const row of results) {

    rows.push(await buildHistoryRow(row));

  return rows;

/**

 * Merge two result lists, keeping `primary` order, then topping up from `secondary`,

 * while de-duping by url (fallback to id).

 * @param {object[]} primary

 * @param {object[]} secondary

 * @param {number} limit

 * @returns {object[]}

*/

export function mergeDedupe(primary, secondary, limit) {

  const seen = new Set();

  const out = [];

  const keyOf = r => r?.url || r?.id;

  for (const r of primary || []) {

    const k = keyOf(r);

    if (!seen.has(k)) {

      seen.add(k);

      out.push(r);

      if (out.length >= limit) {

        return out;

  for (const r of secondary || []) {

    const k = keyOf(r);

    if (!seen.has(k)) {

      seen.add(k);

      out.push(r);

      if (out.length >= limit) {

        return out;

  return out;

export const SearchBrowsingHistoryDomainBoost = Object.freeze({

  matchDomains,

  searchByDomains,

  mergeDedupe,

});