/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// @ts-check
/**
* @import { PipelineOptions } from "chrome://global/content/ml/EngineProcess.sys.mjs"
* @import { BackendError } from "./Pipeline.mjs"
* @import { MLEngineWorker } from "../MLEngine.worker.mjs"
* @import { EmbeddingDType, EmbeddingRequest, EmbeddingResponse, PreTrainedTokenizer } from "./StaticEmbeddingsPipeline.d.ts"
*/
/**
* @typedef {object} Lazy
* @property {typeof import("chrome://global/content/ml/Utils.sys.mjs").createFileUrl} createFileUrl
* @property {typeof import("chrome://global/content/ml/Utils.sys.mjs").parseNpy} parseNpy
* @property {typeof import("chrome://global/content/ml/OPFS.sys.mjs").OPFS} OPFS
* @property {typeof import("chrome://global/content/ml/backends/ONNXPipeline.mjs").importTransformers} importTransformers
* @property {typeof import("chrome://global/content/ml/EngineProcess.sys.mjs").QuantizationLevel} QuantizationLevel
*/
/** @type {Lazy} */
const lazy = /** @type {any} */ ({});
ChromeUtils.defineESModuleGetters(
lazy,
{
createFileUrl: "chrome://global/content/ml/Utils.sys.mjs",
parseNpy: "chrome://global/content/ml/Utils.sys.mjs",
OPFS: "chrome://global/content/ml/OPFS.sys.mjs",
importTransformers: "chrome://global/content/ml/backends/ONNXPipeline.mjs",
QuantizationLevel: "chrome://global/content/ml/EngineProcess.sys.mjs",
},
{ global: "current" }
);
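// The getters defined above are lazy: each property of `lazy` is resolved from
// its module the first time it is accessed, rather than at import time.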
/**
 * Mock out a response object for tests. This should have the same type as {@link Response}.
*/
class MockedResponse {
/** @type {any} */
#value;
/**
* @param {any} value
*/
constructor(value) {
this.#value = value;
}
/**
* @returns {ReturnType<Response["json"]>}
*/
json() {
return this.#value;
}
/**
* @returns {ReturnType<Response["arrayBuffer"]>}
*/
arrayBuffer() {
return this.#value;
}
}
/**
* Embeddings are typically generated through running text through a BERT-like model
* that is an encoder-only transformer. However, this is expensive and slow. Static
* embeddings allow for a cheaper way to generate an embedding by just averaging the
* values of each token's embedding vector. This involves a simple lookup per token, and
* then some vector math. These embeddings are often good enough for looking up
* semantically similar values.
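 *
 * An illustrative sketch with made-up numbers (not values from a real model):
 *
 * @example
 * // Suppose a 2-dimensional embedding table contains the rows:
 * //   token 7 → [0.2, 0.4]
 * //   token 9 → [0.6, 0.0]
 * // The static embedding of the two-token sequence is the per-dimension mean:
 * //   [(0.2 + 0.6) / 2, (0.4 + 0.0) / 2] === [0.4, 0.2]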
*/
export class StaticEmbeddingsPipeline {
/** @type {PreTrainedTokenizer} */
#tokenizer;
/**
   * The size of the embedding dimension, e.g. 128, 256, or 512
*
* @type {number}
*/
#dimensions;
/** @type {number | null} */
#initializeStart;
/**
* Get a native JS double out of the backing data array.
*
* @type {(index: number) => number}
*/
#getFloat;
/**
* @param {PreTrainedTokenizer} tokenizer
* @param {ArrayBuffer} npyData
* @param {EmbeddingDType} dtype
* @param {number} dimensions
* @param {number} initializeStart
*/
constructor(tokenizer, npyData, dtype, dimensions, initializeStart) {
this.#tokenizer = tokenizer;
this.#dimensions = dimensions;
this.#initializeStart = initializeStart;
const {
data: embeddings,
shape: [vocabSize, dimActual],
} = lazy.parseNpy(npyData);
if (dimActual != this.#dimensions) {
throw new Error(
`The dimensions requested (${this.#dimensions}) and the dimensions received (${dimActual}) did not match`
);
}
if (tokenizer.model.vocab.length != vocabSize) {
throw new Error(
        `The tokenizer vocab size (${tokenizer.model.vocab.length}) did not match the data vocab size (${vocabSize})`
);
}
switch (dtype) {
case lazy.QuantizationLevel.FP32:
case lazy.QuantizationLevel.FP16:
// No processing is needed.
this.#getFloat = index => this.embeddings[index];
break;
case lazy.QuantizationLevel.FP8_E5M2:
this.#getFloat = this.#getFp8_E5M2;
break;
case lazy.QuantizationLevel.FP8_E4M3:
this.#getFloat = this.#getFp8_E4M3;
break;
default:
throw new Error("Unsupported dtype: " + dtype);
}
    /**
     * The raw embedding table: a flattened [vocabSize, dimensions] matrix that
     * #getFloat indexes one element at a time.
     *
     * @type {ArrayBufferLike}
     */
    this.embeddings = embeddings;
}
/**
* @param {MLEngineWorker} worker
* @param {null} _wasm
* @param {PipelineOptions} pipelineOptions
* @param {(error: any) => BackendError} _createError
*/
static async initialize(worker, _wasm, pipelineOptions, _createError) {
let initializeStart = ChromeUtils.now();
const {
backend,
modelHubRootUrl,
modelHubUrlTemplate,
modelId,
modelRevision,
staticEmbeddingsOptions,
} = pipelineOptions;
// These are the options that are specific to this engine.
const { subfolder, dtype, dimensions, compression, mockedValues } =
staticEmbeddingsOptions;
const extension = compression ? ".zst" : "";
const files = [
`${subfolder}/tokenizer.json${extension}`,
`${subfolder}/${dtype}.d${dimensions}.npy${extension}`,
];
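    // For illustration, with hypothetical options (subfolder "static", dtype
    // "fp16", dimensions 256, compression enabled) the requested files are:
    //   static/tokenizer.json.zst
    //   static/fp16.d256.npy.zst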
/**
* @param {string} fileName
* @returns {Promise<Response | MockedResponse>}
*/
async function getResponse(fileName) {
const url = lazy.createFileUrl({
file: fileName,
model: modelId,
revision: modelRevision,
urlTemplate: modelHubUrlTemplate,
rootUrl: modelHubRootUrl,
});
if (mockedValues) {
const mockedValue = mockedValues[url];
if (!mockedValue) {
throw new Error(
"Could not find mocked value for requested url: " + url
);
}
if (url.endsWith(`.json${extension}`)) {
return new MockedResponse(mockedValue);
}
return new MockedResponse(new Uint8Array(mockedValue).buffer);
}
const modelFile = await worker.getModelFile({ url });
      // The third entry of the `ok` tuple is the file's path in OPFS.
      const filePath = modelFile.ok[2];
const fileHandle = await lazy.OPFS.getFileHandle(filePath);
const file = await fileHandle.getFile();
let stream = file.stream();
if (compression) {
const decompressionStream = new DecompressionStream("zstd");
stream = stream.pipeThrough(decompressionStream);
}
return new Response(stream);
}
const [tokenizerJsonResponse, npyDataResponse] = await Promise.all(
files.map(getResponse)
);
const npyData = await npyDataResponse.arrayBuffer();
const tokenizerJson = await tokenizerJsonResponse.json();
let assetsLoad = ChromeUtils.now();
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
initializeStart,
"Assets load"
);
const { PreTrainedTokenizer } = await lazy.importTransformers(backend);
const tokenizer = new PreTrainedTokenizer(tokenizerJson, {});
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
assetsLoad,
"Tokenizer load"
);
return new StaticEmbeddingsPipeline(
tokenizer,
npyData,
dtype,
dimensions,
initializeStart
);
}
/**
* @param {number} index
*/
#getFp8_E5M2 = index => {
const byte = this.embeddings[index];
// Do some bit manipulation to extract the sign (S), the exponent (E), and the
// mantissa (M)
// This is format: | S E E E | E E M M |
// To do the manipulation, shift the bits to the right (>>) and mask off the relevant
// bits with an & operation.
const sign = (byte >> 7) & 0b0000_0001;
const exponent = (byte >> 2) & 0b0001_1111;
const mantissa = byte & 0b0000_0011;
const bias = 15;
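    // Worked example, using an assumed byte 0b0100_0010 for illustration:
    //   sign = 0, exponent = 0b10000 = 16, mantissa = 0b10 = 2
    //   value = (1 + 2/4) * 2^(16 - 15) = 1.5 * 2 = 3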
if (exponent === 0) {
if (mantissa === 0) {
// Zero
return sign ? -0 : 0;
}
// Subnormal: exponent = 1 - bias, no implicit leading 1
const frac = mantissa / 4; // 2 mantissa bits → divide by 2^2
const value = frac * Math.pow(2, 1 - bias);
return sign ? -value : value;
} else if (exponent === 0x1f) {
if (mantissa === 0) {
return sign ? -Infinity : Infinity;
}
return NaN;
}
// Normalized
const frac = 1 + mantissa / 4;
const value = frac * Math.pow(2, exponent - bias);
return sign ? -value : value;
};
/**
* @param {number} index
*/
#getFp8_E4M3 = index => {
const byte = this.embeddings[index];
// Do some bit manipulation to extract the sign (S), the exponent (E), and the
// mantissa (M)
// This is format: | S E E E | E M M M |
// To do the manipulation, shift the bits to the right (>>) and mask off the relevant
// bits with an & operation.
const sign = (byte >> 7) & 0b0000_0001;
const exponent = (byte >> 3) & 0b0000_1111;
const mantissa = byte & 0b0000_0111;
const bias = 7;
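    // Worked example, using an assumed byte 0b0100_0101 for illustration:
    //   sign = 0, exponent = 0b1000 = 8, mantissa = 0b101 = 5
    //   value = (1 + 5/8) * 2^(8 - 7) = 1.625 * 2 = 3.25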
if (exponent === 0) {
if (mantissa === 0) {
return sign ? -0 : 0;
}
// Subnormal
const frac = mantissa / 8; // 3 mantissa bits → divide by 2^3
const value = frac * Math.pow(2, 1 - bias);
return sign ? -value : value;
} else if (exponent === 0xf) {
if (mantissa === 0) {
return sign ? -Infinity : Infinity;
}
return NaN;
}
// Normalized
const frac = 1 + mantissa / 8;
const value = frac * Math.pow(2, exponent - bias);
return sign ? -value : value;
};
/**
* @param {EmbeddingRequest} request
* @param {number} _requestId
* @param {null} _engineRunOptions
* @returns {EmbeddingResponse}
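   *
   * An illustrative call shape, with hypothetical values (see EmbeddingRequest
   * in StaticEmbeddingsPipeline.d.ts for the real definition):
   *
   * @example
   * pipeline.run(
   *   { args: ["hello world"], options: { pooling: "mean", normalize: true } },
   *   0,
   *   null
   * );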
*/
run(request, _requestId, _engineRunOptions) {
if (request.options.pooling != "mean") {
throw new Error(
`Only "mean" pooling is currently supported, please add support "${request.options.pooling}" here.`
);
}
let tokenCount = 0;
const sequenceCount = request.args.length;
let beforeResponse = ChromeUtils.now();
const response = {
metrics: [],
output: request.args.map(text => {
// Always do the vector math in f32 space, even if the underlying precision
// is lower.
const embedding = new Float32Array(this.#dimensions);
/** @type {number[]} */
const tokenIds = this.#tokenizer.encode(text);
tokenCount += tokenIds.length;
// Sum up the embeddings.
for (const tokenId of tokenIds) {
for (let i = 0; i < this.#dimensions; i++) {
            // Inflate the value into a JavaScript double, then add it.
embedding[i] += this.#getFloat(tokenId * this.#dimensions + i);
}
}
if (request.options.normalize) {
          // Compute the average by dividing by the number of tokens provided.
          // Also compute the sum of the squares while we're here.
let sumSquares = 0;
for (let i = 0; i < this.#dimensions; i++) {
const n = embedding[i] / tokenIds.length;
embedding[i] = n;
sumSquares += n * n;
}
// Apply the normalization.
const magnitude = Math.sqrt(sumSquares);
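          // For example, an averaged vector [3, 4] has magnitude
          // sqrt(9 + 16) = 5 and normalizes to the unit vector [0.6, 0.8].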
if (magnitude != 0) {
for (let i = 0; i < this.#dimensions; i++) {
embedding[i] = embedding[i] / magnitude;
}
}
} else {
          // Only compute the average by dividing by the number of tokens provided.
for (let i = 0; i < this.#dimensions; i++) {
embedding[i] = embedding[i] / tokenIds.length;
}
}
return embedding;
}),
};
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
beforeResponse,
`Processed ${sequenceCount} sequences with ${tokenCount} tokens.`
);
if (this.#initializeStart) {
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
this.#initializeStart,
"Time to first response"
);
this.#initializeStart = null;
}
return response;
}
}