/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// Gloda.newQuery / Gloda.scoreNounItems are used below; GlodaPublic is the
// module that exports the Gloda public API.
import { Gloda } from "resource:///modules/gloda/GlodaPublic.sys.mjs";
import { GlodaConstants } from "resource:///modules/gloda/GlodaConstants.sys.mjs";
/**
 * How much time boost should a 'score point' amount to? The authoritative,
 * incontrovertible answer, across all time and space, is a week.
 * Note that gloda stores timestamps as PRTimes for no exceedingly good
 * reason.
 */
var FUZZSCORE_TIMESTAMP_FACTOR = 1000 * 1000 * 60 * 60 * 24 * 7;
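// (In PRTime microseconds: 604,800,000,000. In DASCORE below, one score
// point is thus worth exactly one week of message recency.)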
var RANK_USAGE = "glodaRank(matchinfo(messagesText), 1.0, 2.0, 2.0, 1.5, 1.5)";

var DASCORE =
  "(((" +
  RANK_USAGE +
  " + messages.notability) * " +
  FUZZSCORE_TIMESTAMP_FACTOR +
  ") + messages.date)";
/**
 * A new optimization decision we are making is that we do not want to carry
 * around any data in our ephemeral tables that is not used for whittling the
 * result set. The idea is that the btree page cache or OS cache is going to
 * save us from the disk seeks and carrying around the extra data is just
 * going to be CPU/memory churn that slows us down.
 *
 * Additionally, we try to avoid row lookups that would have their results
 * discarded by the LIMIT. Because of limitations in FTS3 (which might be
 * addressed in FTS4 by a feature request), we can't avoid the 'messages'
 * lookup since that has the message's date and static notability, but we can
 * defer the 'messagesText' lookup.
 *
 * This is the access pattern we are after here:
 * 1) Order the matches with minimized lookup and result storage costs.
 *    - The innermost MATCH does the doclist magic and provides us with
 *      matchinfo() support, which does not require content row retrieval
 *      from messagesText. Unfortunately, this is not enough to whittle
 *      anything, because we still need static interestingness, so...
 *    - Based on the match, we retrieve the date and notability for that row
 *      from 'messages', using this in conjunction with matchinfo() to
 *      produce a score that we can then use to LIMIT our results.
 * 2) We reissue the MATCH query so that we will be able to use offsets(),
 *    but we intersect the results of this MATCH against our LIMITed results
 *    from step 1.
 *    - We use 'docid IN (phase 1 query)' to accomplish this because it
 *      results in efficient lookup. If we just use a join, we get O(mn)
 *      performance: a cartesian join ends up being performed, where either
 *      we perform the fulltext query M times and table-scan intersect each
 *      result with the results from phase 1, or we do the fulltext query
 *      once but traverse the entire result set from phase 1 N times.
 *    - We believe that the re-execution of the MATCH query should have no
 *      disk costs because it should still be cached by SQLite or the OS. In
 *      the case where memory is so constrained that this is not true, our
 *      behavior is still probably preferable to the old way, because the old
 *      way would have caused lots of swapping.
 *    - This part of the query otherwise resembles the basic gloda query, but
 *      with the inclusion of the offsets() invocation. The messages table
 *      lookup should not involve any disk traffic because the pages should
 *      still be cached (SQLite or OS) from phase 1. The messagesText lookup
 *      is new, and this is the major disk-seek reduction optimization we are
 *      making. (Since we avoid this lookup for all of the documents that
 *      were excluded by the LIMIT.) Since offsets() also needs to retrieve
 *      the row from messagesText, there is a nice synergy there.
 */
var NUEVO_FULLTEXT_SQL =
  "SELECT messages.*, messagesText.*, offsets(messagesText) AS osets " +
  "FROM messagesText, messages " +
  "WHERE" +
  " messagesText MATCH ?1 " +
  " AND messagesText.docid IN (" +
  "SELECT docid " +
  "FROM messagesText JOIN messages ON messagesText.docid = messages.id " +
  "WHERE messagesText MATCH ?1 " +
  "ORDER BY " +
  DASCORE +
  " DESC " +
  "LIMIT ?2" +
  " )" +
  " AND messages.id = messagesText.docid " +
  " AND +messages.deleted = 0" +
  " AND +messages.folderID IS NOT NULL" +
  " AND +messages.messageKey IS NOT NULL";
function identityFunc(x) {
  return x;
}

function oneLessMaxZero(x) {
  if (x <= 1) {
    return 0;
  }
  return x - 1;
}

function reduceSum(accum, curValue) {
  return accum + curValue;
}
/*
 * Columns are: body, subject, attachment names, author, recipients.
 */

/**
 * Scores if all search terms match in a column. We bias against author
 * slightly and recipient a bit more in this case because a search that
 * entirely matches just on a person should give a mention of that person
 * in the subject or attachment a fighting chance.
 * Keep in mind that because of our indexing in the face of address book
 * contacts (namely, we index the name used in the e-mail as well as the
 * display name on the address book card associated with the e-mail address),
 * a contact is going to bias towards matching multiple times.
 */
var COLUMN_ALL_MATCH_SCORES = [4, 20, 20, 16, 12];
/**
 * Score for each distinct term that matches in the column. This is capped
 * by COLUMN_ALL_MATCH_SCORES.
 */
var COLUMN_PARTIAL_PER_MATCH_SCORES = [1, 4, 4, 4, 3];
/**
 * If a term matches multiple times, what is the marginal score for each
 * additional match? We count the total number of matches beyond the
 * first match for each term. In other words, if we have 3 terms which
 * matched 5, 3, and 0 times, then the total from our perspective is
 * (5 - 1) + (3 - 1) + 0 = 4 + 2 + 0 = 6. We take the minimum of that value
 * and the value in COLUMN_MULTIPLE_MATCH_LIMIT and multiply by the value in
 * COLUMN_MULTIPLE_MATCH_SCORES.
 */
var COLUMN_MULTIPLE_MATCH_SCORES = [1, 0, 0, 0, 0];
var COLUMN_MULTIPLE_MATCH_LIMIT = [10, 0, 0, 0, 0];
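// A worked example against the constants above: in the body column
// (index 0), three terms matching 5, 3, and 0 times earn partial-match
// credit of min(4, 1 * 2) = 2 (two distinct terms hit, capped by the
// all-match score) plus multiple-match credit of
// min((5-1) + (3-1) + 0, 10) * 1 = 6, for a total of 8. Only the body
// column awards multiple-match credit at all.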
/**
 * Score the message on its offsets (from stashedColumns).
 */
function scoreOffsets(aMessage, aContext) {
  let score = 0;

  const termTemplate = aContext.terms.map(_ => 0);
  // For each column, a list of the incidence of each term.
  const columnTermIncidence = [
    termTemplate.concat(),
    termTemplate.concat(),
    termTemplate.concat(),
    termTemplate.concat(),
    termTemplate.concat(),
  ];

  // We need the arrow wrapper because otherwise map would pass the element
  // index as parseInt's radix argument. Curse you, map!
  const offsetNums = aContext.stashedColumns[aMessage.id][0]
    .split(" ")
    .map(x => parseInt(x));
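  // offsets() returns space-delimited integers in groups of four per match:
  // column number, term number, byte offset, and match size. We only need
  // the first two to tally which term hit in which column.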
  for (let i = 0; i < offsetNums.length; i += 4) {
    const columnIndex = offsetNums[i];
    const termIndex = offsetNums[i + 1];
    columnTermIncidence[columnIndex][termIndex]++;
  }

  for (let iColumn = 0; iColumn < COLUMN_ALL_MATCH_SCORES.length; iColumn++) {
    const termIncidence = columnTermIncidence[iColumn];
    if (termIncidence.every(identityFunc)) {
      // Bestow all match credit.
      score += COLUMN_ALL_MATCH_SCORES[iColumn];
    } else if (termIncidence.some(identityFunc)) {
      // Bestow partial match credit.
      score += Math.min(
        COLUMN_ALL_MATCH_SCORES[iColumn],
        COLUMN_PARTIAL_PER_MATCH_SCORES[iColumn] *
          termIncidence.filter(identityFunc).length
      );
    }
    // Bestow multiple match credit.
    score +=
      Math.min(
        termIncidence.map(oneLessMaxZero).reduce(reduceSum, 0),
        COLUMN_MULTIPLE_MATCH_LIMIT[iColumn]
      ) * COLUMN_MULTIPLE_MATCH_SCORES[iColumn];
  }

  return score;
}
/**
 * The searcher basically looks like a query, but is specialized for fulltext
 * search against messages. Most of the explicit specialization involves
 * crafting a SQL query that attempts to order the matches by the likelihood
 * that the user was looking for them. This is based on full-text matches
 * combined with an explicit (generic) interest score value placed on the
 * message at indexing time. This is followed by using the more generic gloda
 * scoring mechanism to explicitly score the messages given the search
 * context, in addition to the generic score-adjusting rules.
 */
export function GlodaMsgSearcher(aListener, aSearchString, aAndTerms) {
  this.listener = aListener;

  this.searchString = aSearchString;
  this.fulltextTerms = this.parseSearchString(aSearchString);
  this.andTerms = aAndTerms != null ? aAndTerms : true;

  this.query = null;
  this.collection = null;
  this.scores = null;
}
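// A minimal usage sketch (myListener is a hypothetical object implementing
// the onItemsAdded/onItemsModified/onItemsRemoved/onQueryCompleted callbacks
// that the prototype below forwards):
//
//   const searcher = new GlodaMsgSearcher(myListener, 'meeting "q3 budget"');
//   searcher.getCollection();
//   // Results arrive asynchronously via the listener; searcher.scores
//   // accumulates the offsets()-based scores computed in onItemsAdded.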
GlodaMsgSearcher.prototype = {
  /**
   * Number of messages to retrieve initially.
   */
  get retrievalLimit() {
    return Services.prefs.getIntPref(
      "mailnews.database.global.search.msg.limit"
    );
  },

  /**
   * Parse the string into terms/phrases by finding matching double-quotes.
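   * For example, parsing 'foo "bar baz" qux' yields
   * ["foo", "bar baz", "qux"].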
   */
  parseSearchString(aSearchString) {
    aSearchString = aSearchString.trim();
    const terms = [];

    /*
     * Add the term as long as the trim on the way in didn't obliterate it.
     *
     * In the future this might have other helper logic; it did once before.
     */
    function addTerm(aTerm) {
      if (aTerm) {
        terms.push(aTerm);
      }
    }

    while (aSearchString) {
      if (aSearchString.startsWith('"')) {
        const endIndex = aSearchString.indexOf(aSearchString[0], 1);
        // Eat the quote if it has no friend.
        if (endIndex == -1) {
          aSearchString = aSearchString.substring(1);
          continue;
        }

        addTerm(aSearchString.substring(1, endIndex).trim());
        aSearchString = aSearchString.substring(endIndex + 1);
        continue;
      }

      const spaceIndex = aSearchString.indexOf(" ");
      if (spaceIndex == -1) {
        addTerm(aSearchString);
        break;
      }

      addTerm(aSearchString.substring(0, spaceIndex));
      aSearchString = aSearchString.substring(spaceIndex + 1);
    }

    return terms;
  },

  buildFulltextQuery() {
    const query = Gloda.newQuery(GlodaConstants.NOUN_MESSAGE, {
      noMagic: true,
      explicitSQL: NUEVO_FULLTEXT_SQL,
      limitClauseAlreadyIncluded: true,
      // osets is 0-based column number 14 (volatile to column changes).
      // Save the offset column for extra analysis.
      stashColumns: [14],
    });

    let fulltextQueryString = "";

    for (const [iTerm, term] of this.fulltextTerms.entries()) {
      if (iTerm) {
        fulltextQueryString += this.andTerms ? " " : " OR ";
      }

      // Put our term in quotes. This is needed for the tokenizer to be able
      // to do useful things. The exception is people clever enough to use
      // NEAR.
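      // For example (a sketch, not an exhaustive spec): with andTerms, the
      // terms ["NEAR/2", "hello"] come through as 'NEAR/2 "hello"'. Note
      // that non-CJK terms shorter than three characters fall through every
      // branch below and are silently dropped from the query.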
      if (/^NEAR(\/\d+)?$/.test(term)) {
        fulltextQueryString += term;
      } else if (term.length == 1 && term.charCodeAt(0) >= 0x2000) {
        // This is a single-character CJK search query, so add a wildcard.
        // Our tokenizer treats anything at/above 0x2000 as CJK for now.
        fulltextQueryString += term + "*";
      } else if (
        (term.length == 2 &&
          term.charCodeAt(0) >= 0x2000 &&
          term.charCodeAt(1) >= 0x2000) ||
        term.length >= 3
      ) {
        fulltextQueryString += '"' + term + '"';
      }
    }

    query.fulltextMatches(fulltextQueryString);
    query.limit(this.retrievalLimit);

    return query;
  },

  getCollection(aListenerOverride, aData) {
    if (aListenerOverride) {
      this.listener = aListenerOverride;
    }

    this.query = this.buildFulltextQuery();
    this.collection = this.query.getCollection(this, aData);
    this.completed = false;

    return this.collection;
  },

  sortBy: "-dascore",

  onItemsAdded(aItems, aCollection) {
    const newScores = Gloda.scoreNounItems(
      aItems,
      {
        terms: this.fulltextTerms,
        stashedColumns: aCollection.stashedColumns,
      },
      [scoreOffsets]
    );

    if (this.scores) {
      this.scores = this.scores.concat(newScores);
    } else {
      this.scores = newScores;
    }

    if (this.listener) {
      this.listener.onItemsAdded(aItems, aCollection);
    }
  },

  onItemsModified(aItems, aCollection) {
    if (this.listener) {
      this.listener.onItemsModified(aItems, aCollection);
    }
  },

  onItemsRemoved(aItems, aCollection) {
    if (this.listener) {
      this.listener.onItemsRemoved(aItems, aCollection);
    }
  },

  onQueryCompleted(aCollection) {
    this.completed = true;
    if (this.listener) {
      this.listener.onQueryCompleted(aCollection);
    }
  },
};