/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// Gloda.newQuery / Gloda.scoreNounItems are used below; GlodaPublic is the
// module that exports the Gloda public API.
import { Gloda } from "resource:///modules/gloda/GlodaPublic.sys.mjs";
import { GlodaConstants } from "resource:///modules/gloda/GlodaConstants.sys.mjs";
/**
 * How much time boost should a 'score point' amount to? The authoritative,
 * incontrovertible answer, across all time and space, is a week.
 * Note that gloda stores timestamps as PRTimes for no exceedingly good
 * reason.
 */
var FUZZSCORE_TIMESTAMP_FACTOR = 1000 * 1000 * 60 * 60 * 24 * 7;
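// (In PRTime microseconds: 604,800,000,000. In DASCORE below, one score
// point is thus worth exactly one week of message recency.)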
var RANK_USAGE = "glodaRank(matchinfo(messagesText), 1.0, 2.0, 2.0, 1.5, 1.5)";

var DASCORE =
  "(((" +
  RANK_USAGE +
  " + messages.notability) * " +
  FUZZSCORE_TIMESTAMP_FACTOR +
  ") + messages.date)";
/**
 * A new optimization decision we are making is that we do not want to carry
 * around any data in our ephemeral tables that is not used for whittling the
 * result set. The idea is that the btree page cache or OS cache is going to
 * save us from the disk seeks and carrying around the extra data is just
 * going to be CPU/memory churn that slows us down.
 *
 * Additionally, we try to avoid row lookups that would have their results
 * discarded by the LIMIT. Because of limitations in FTS3 (which might be
 * addressed in FTS4 by a feature request), we can't avoid the 'messages'
 * lookup since that has the message's date and static notability, but we can
 * defer the 'messagesText' lookup.
 *
 * This is the access pattern we are after here:
 * 1) Order the matches with minimized lookup and result storage costs.
 *    - The innermost MATCH does the doclist magic and provides us with
 *      matchinfo() support, which does not require content row retrieval
 *      from messagesText. Unfortunately, this is not enough to whittle
 *      anything, because we still need static interestingness, so...
 *    - Based on the match, we retrieve the date and notability for that row
 *      from 'messages', using this in conjunction with matchinfo() to
 *      produce a score that we can then use to LIMIT our results.
 * 2) We reissue the MATCH query so that we will be able to use offsets(),
 *    but we intersect the results of this MATCH against our LIMITed results
 *    from step 1.
 *    - We use 'docid IN (phase 1 query)' to accomplish this because it
 *      results in efficient lookup. If we just use a join, we get O(mn)
 *      performance: a cartesian join ends up being performed, where either
 *      we perform the fulltext query M times and table-scan intersect each
 *      result with the results from phase 1, or we do the fulltext query
 *      once but traverse the entire result set from phase 1 N times.
 *    - We believe that the re-execution of the MATCH query should have no
 *      disk costs because it should still be cached by SQLite or the OS. In
 *      the case where memory is so constrained that this is not true, our
 *      behavior is still probably preferable to the old way, because the old
 *      way would have caused lots of swapping.
 *    - This part of the query otherwise resembles the basic gloda query, but
 *      with the inclusion of the offsets() invocation. The messages table
 *      lookup should not involve any disk traffic because the pages should
 *      still be cached (SQLite or OS) from phase 1. The messagesText lookup
 *      is new, and this is the major disk-seek reduction optimization we are
 *      making. (Since we avoid this lookup for all of the documents that
 *      were excluded by the LIMIT.) Since offsets() also needs to retrieve
 *      the row from messagesText, there is a nice synergy there.
 */
var NUEVO_FULLTEXT_SQL =
  "SELECT messages.*, messagesText.*, offsets(messagesText) AS osets " +
  "FROM messagesText, messages " +
  "WHERE" +
  " messagesText MATCH ?1 " +
  " AND messagesText.docid IN (" +
  "SELECT docid " +
  "FROM messagesText JOIN messages ON messagesText.docid = messages.id " +
  "WHERE messagesText MATCH ?1 " +
  "ORDER BY " +
  DASCORE +
  " DESC " +
  "LIMIT ?2" +
  " )" +
  " AND messages.id = messagesText.docid " +
  " AND +messages.deleted = 0" +
  " AND +messages.folderID IS NOT NULL" +
  " AND +messages.messageKey IS NOT NULL";
function identityFunc(x) {
  return x;
}

function oneLessMaxZero(x) {
  if (x <= 1) {
    return 0;
  }
  return x - 1;
}

function reduceSum(accum, curValue) {
  return accum + curValue;
}
/*
 * Columns are: body, subject, attachment names, author, recipients.
 */

/**
 * Scores if all search terms match in a column. We bias against author
 * slightly and recipient a bit more in this case because a search that
 * entirely matches just on a person should give a mention of that person
 * in the subject or attachment a fighting chance.
 * Keep in mind that because of our indexing in the face of address book
 * contacts (namely, we index the name used in the e-mail as well as the
 * display name on the address book card associated with the e-mail address),
 * a contact is going to bias towards matching multiple times.
 */
var COLUMN_ALL_MATCH_SCORES = [4, 20, 20, 16, 12];
/**
 * Score for each distinct term that matches in the column. This is capped
 * by COLUMN_ALL_MATCH_SCORES.
 */
var COLUMN_PARTIAL_PER_MATCH_SCORES = [1, 4, 4, 4, 3];
/**
 * If a term matches multiple times, what is the marginal score for each
 * additional match? We count the total number of matches beyond the
 * first match for each term. In other words, if we have 3 terms which
 * matched 5, 3, and 0 times, then the total from our perspective is
 * (5 - 1) + (3 - 1) + 0 = 4 + 2 + 0 = 6. We take the minimum of that value
 * and the value in COLUMN_MULTIPLE_MATCH_LIMIT and multiply by the value in
 * COLUMN_MULTIPLE_MATCH_SCORES.
 */
var COLUMN_MULTIPLE_MATCH_SCORES = [1, 0, 0, 0, 0];
var COLUMN_MULTIPLE_MATCH_LIMIT = [10, 0, 0, 0, 0];
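// A worked example against the constants above: in the body column
// (index 0), three terms matching 5, 3, and 0 times earn partial-match
// credit of min(4, 1 * 2) = 2 (two distinct terms hit, capped by the
// all-match score) plus multiple-match credit of
// min((5-1) + (3-1) + 0, 10) * 1 = 6, for a total of 8. Only the body
// column awards multiple-match credit at all.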
/**
 * Score the message on its offsets (from stashedColumns).
 */
function scoreOffsets(aMessage, aContext) {
  let score = 0;

  const termTemplate = aContext.terms.map(_ => 0);
  // For each column, a list of the incidence of each term.
  const columnTermIncidence = [
    termTemplate.concat(),
    termTemplate.concat(),
    termTemplate.concat(),
    termTemplate.concat(),
    termTemplate.concat(),
  ];

  // We need the arrow wrapper because otherwise map would pass the element
  // index as parseInt's radix argument. Curse you, map!
  const offsetNums = aContext.stashedColumns[aMessage.id][0]
    .split(" ")
    .map(x => parseInt(x));
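  // offsets() returns space-delimited integers in groups of four per match:
  // column number, term number, byte offset, and match size. We only need
  // the first two to tally which term hit in which column.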
  for (let i = 0; i < offsetNums.length; i += 4) {
    const columnIndex = offsetNums[i];
    const termIndex = offsetNums[i + 1];
    columnTermIncidence[columnIndex][termIndex]++;
  }

  for (let iColumn = 0; iColumn < COLUMN_ALL_MATCH_SCORES.length; iColumn++) {
    const termIncidence = columnTermIncidence[iColumn];
    if (termIncidence.every(identityFunc)) {
      // Bestow all match credit.
      score += COLUMN_ALL_MATCH_SCORES[iColumn];
    } else if (termIncidence.some(identityFunc)) {
      // Bestow partial match credit.
      score += Math.min(
        COLUMN_ALL_MATCH_SCORES[iColumn],
        COLUMN_PARTIAL_PER_MATCH_SCORES[iColumn] *
          termIncidence.filter(identityFunc).length
      );
    }
    // Bestow multiple match credit.
    score +=
      Math.min(
        termIncidence.map(oneLessMaxZero).reduce(reduceSum, 0),
        COLUMN_MULTIPLE_MATCH_LIMIT[iColumn]
      ) * COLUMN_MULTIPLE_MATCH_SCORES[iColumn];
  }

  return score;
}
/**
 * The searcher basically looks like a query, but is specialized for fulltext
 * search against messages. Most of the explicit specialization involves
 * crafting a SQL query that attempts to order the matches by the likelihood
 * that the user was looking for them. This is based on full-text matches
 * combined with an explicit (generic) interest score value placed on the
 * message at indexing time. This is followed by using the more generic gloda
 * scoring mechanism to explicitly score the messages given the search
 * context, in addition to the generic score-adjusting rules.
 */
export function GlodaMsgSearcher(aListener, aSearchString, aAndTerms) {
  this.listener = aListener;

  this.searchString = aSearchString;
  this.fulltextTerms = this.parseSearchString(aSearchString);
  this.andTerms = aAndTerms != null ? aAndTerms : true;

  this.query = null;
  this.collection = null;
  this.scores = null;
}
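// A minimal usage sketch (myListener is a hypothetical object implementing
// the onItemsAdded/onItemsModified/onItemsRemoved/onQueryCompleted callbacks
// that the prototype below forwards):
//
//   const searcher = new GlodaMsgSearcher(myListener, 'meeting "q3 budget"');
//   searcher.getCollection();
//   // Results arrive asynchronously via the listener; searcher.scores
//   // accumulates the offsets()-based scores computed in onItemsAdded.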
GlodaMsgSearcher.prototype = {
  /**
   * Number of messages to retrieve initially.
   */
  get retrievalLimit() {
    return Services.prefs.getIntPref(
      "mailnews.database.global.search.msg.limit"
    );
  },

  /**
   * Parse the string into terms/phrases by finding matching double-quotes.
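   * For example, parsing 'foo "bar baz" qux' yields
   * ["foo", "bar baz", "qux"].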
   */
  parseSearchString(aSearchString) {
    aSearchString = aSearchString.trim();
    const terms = [];

    /*
     * Add the term as long as the trim on the way in didn't obliterate it.
     *
     * In the future this might have other helper logic; it did once before.
     */
    function addTerm(aTerm) {
      if (aTerm) {
        terms.push(aTerm);
      }
    }

    while (aSearchString) {
      if (aSearchString.startsWith('"')) {
        const endIndex = aSearchString.indexOf(aSearchString[0], 1);
        // Eat the quote if it has no friend.
        if (endIndex == -1) {
          aSearchString = aSearchString.substring(1);
          continue;
        }

        addTerm(aSearchString.substring(1, endIndex).trim());
        aSearchString = aSearchString.substring(endIndex + 1);
        continue;
      }

      const spaceIndex = aSearchString.indexOf(" ");
      if (spaceIndex == -1) {
        addTerm(aSearchString);
        break;
      }

      addTerm(aSearchString.substring(0, spaceIndex));
      aSearchString = aSearchString.substring(spaceIndex + 1);
    }

    return terms;
  },

  buildFulltextQuery() {
    const query = Gloda.newQuery(GlodaConstants.NOUN_MESSAGE, {
      noMagic: true,
      explicitSQL: NUEVO_FULLTEXT_SQL,
      limitClauseAlreadyIncluded: true,
      // osets is 0-based column number 14 (volatile to column changes).
      // Save the offset column for extra analysis.
      stashColumns: [14],
    });

    let fulltextQueryString = "";

    for (const [iTerm, term] of this.fulltextTerms.entries()) {
      if (iTerm) {
        fulltextQueryString += this.andTerms ? " " : " OR ";
      }

      // Put our term in quotes. This is needed for the tokenizer to be able
      // to do useful things. The exception is people clever enough to use
      // NEAR.
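      // For example (a sketch, not an exhaustive spec): with andTerms, the
      // terms ["NEAR/2", "hello"] come through as 'NEAR/2 "hello"'. Note
      // that non-CJK terms shorter than three characters fall through every
      // branch below and are silently dropped from the query.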
      if (/^NEAR(\/\d+)?$/.test(term)) {
        fulltextQueryString += term;
      } else if (term.length == 1 && term.charCodeAt(0) >= 0x2000) {
        // This is a single-character CJK search query, so add a wildcard.
        // Our tokenizer treats anything at/above 0x2000 as CJK for now.
        fulltextQueryString += term + "*";
      } else if (
        (term.length == 2 &&
          term.charCodeAt(0) >= 0x2000 &&
          term.charCodeAt(1) >= 0x2000) ||
        term.length >= 3
      ) {
        fulltextQueryString += '"' + term + '"';
      }
    }

    query.fulltextMatches(fulltextQueryString);
    query.limit(this.retrievalLimit);

    return query;
  },

  getCollection(aListenerOverride, aData) {
    if (aListenerOverride) {
      this.listener = aListenerOverride;
    }

    this.query = this.buildFulltextQuery();
    this.collection = this.query.getCollection(this, aData);
    this.completed = false;

    return this.collection;
  },

  sortBy: "-dascore",

  onItemsAdded(aItems, aCollection) {
    const newScores = Gloda.scoreNounItems(
      aItems,
      {
        terms: this.fulltextTerms,
        stashedColumns: aCollection.stashedColumns,
      },
      [scoreOffsets]
    );

    if (this.scores) {
      this.scores = this.scores.concat(newScores);
    } else {
      this.scores = newScores;
    }

    if (this.listener) {
      this.listener.onItemsAdded(aItems, aCollection);
    }
  },

  onItemsModified(aItems, aCollection) {
    if (this.listener) {
      this.listener.onItemsModified(aItems, aCollection);
    }
  },

  onItemsRemoved(aItems, aCollection) {
    if (this.listener) {
      this.listener.onItemsRemoved(aItems, aCollection);
    }
  },

  onQueryCompleted(aCollection) {
    this.completed = true;
    if (this.listener) {
      this.listener.onQueryCompleted(aCollection);
    }
  },
};