/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// @ts-check
/**
* @import { PipelineOptions } from "chrome://global/content/ml/EngineProcess.sys.mjs"
* @import { BackendError } from "./Pipeline.mjs"
* @import { MLEngineWorker } from "../MLEngine.worker.mjs"
* @import { EmbeddingDType, EmbeddingRequest, EmbeddingResponse, PreTrainedTokenizer } from "./StaticEmbeddingsPipeline.d.ts"
*/
/**
* @typedef {object} Lazy
* @property {typeof import("chrome://global/content/ml/Utils.sys.mjs").createFileUrl} createFileUrl
* @property {typeof import("chrome://global/content/ml/Utils.sys.mjs").parseNpy} parseNpy
* @property {typeof import("chrome://global/content/ml/OPFS.sys.mjs").OPFS} OPFS
* @property {typeof import("chrome://global/content/ml/backends/ONNXPipeline.mjs").importTransformers} importTransformers
* @property {typeof import("chrome://global/content/ml/EngineProcess.sys.mjs").QuantizationLevel} QuantizationLevel
*/
/** @type {Lazy} */
const lazy = /** @type {any} */ ({});
ChromeUtils.defineESModuleGetters(
lazy,
{
createFileUrl: "chrome://global/content/ml/Utils.sys.mjs",
parseNpy: "chrome://global/content/ml/Utils.sys.mjs",
OPFS: "chrome://global/content/ml/OPFS.sys.mjs",
importTransformers: "chrome://global/content/ml/backends/ONNXPipeline.mjs",
QuantizationLevel: "chrome://global/content/ml/EngineProcess.sys.mjs",
},
{ global: "current" }
);
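// The getters defined above are lazy: each property of `lazy` is resolved from
// its module the first time it is accessed, rather than at import time.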
/**
 * Mock out a response object for tests. This should have the same type as {@link Response}.
*/
class MockedResponse {
/** @type {any} */
#value;
/**
* @param {any} value
*/
constructor(value) {
this.#value = value;
}
/**
* @returns {ReturnType<Response["json"]>}
*/
json() {
return this.#value;
}
/**
* @returns {ReturnType<Response["arrayBuffer"]>}
*/
arrayBuffer() {
return this.#value;
}
}
/**
* Embeddings are typically generated through running text through a BERT-like model
* that is an encoder-only transformer. However, this is expensive and slow. Static
* embeddings allow for a cheaper way to generate an embedding by just averaging the
* values of each token's embedding vector. This involves a simple lookup per token, and
* then some vector math. These embeddings are often good enough for looking up
* semantically similar values.
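 *
 * An illustrative sketch with made-up numbers (not values from a real model):
 *
 * @example
 * // Suppose a 2-dimensional embedding table contains the rows:
 * //   token 7 → [0.2, 0.4]
 * //   token 9 → [0.6, 0.0]
 * // The static embedding of the two-token sequence is the per-dimension mean:
 * //   [(0.2 + 0.6) / 2, (0.4 + 0.0) / 2] === [0.4, 0.2]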
*/
export class StaticEmbeddingsPipeline {
/** @type {PreTrainedTokenizer} */
#tokenizer;
/**
   * The size of the embedding dimension, e.g. 128, 256, or 512
*
* @type {number}
*/
#dimensions;
/** @type {number | null} */
#initializeStart;
/**
* Get a native JS double out of the backing data array.
*
* @type {(index: number) => number}
*/
#getFloat;
/**
* @param {PreTrainedTokenizer} tokenizer
* @param {ArrayBuffer} npyData
* @param {EmbeddingDType} dtype
* @param {number} dimensions
* @param {number} initializeStart
*/
constructor(tokenizer, npyData, dtype, dimensions, initializeStart) {
this.#tokenizer = tokenizer;
this.#dimensions = dimensions;
this.#initializeStart = initializeStart;
const {
data: embeddings,
shape: [vocabSize, dimActual],
} = lazy.parseNpy(npyData);
if (dimActual != this.#dimensions) {
throw new Error(
`The dimensions requested (${this.#dimensions}) and the dimensions received (${dimActual}) did not match`
);
}
if (tokenizer.model.vocab.length != vocabSize) {
throw new Error(
        `The tokenizer vocab size (${tokenizer.model.vocab.length}) did not match the data vocab size (${vocabSize})`
);
}
switch (dtype) {
case lazy.QuantizationLevel.FP32:
case lazy.QuantizationLevel.FP16:
// No processing is needed.
this.#getFloat = index => this.embeddings[index];
break;
case lazy.QuantizationLevel.FP8_E5M2:
this.#getFloat = this.#getFp8_E5M2;
break;
case lazy.QuantizationLevel.FP8_E4M3:
this.#getFloat = this.#getFp8_E4M3;
break;
default:
throw new Error("Unsupported dtype: " + dtype);
}
    /**
     * The raw embedding table: a flattened [vocabSize, dimensions] matrix that
     * #getFloat indexes one element at a time.
     *
     * @type {ArrayBufferLike}
     */
    this.embeddings = embeddings;
}
/**
* @param {MLEngineWorker} worker
* @param {null} _wasm
* @param {PipelineOptions} pipelineOptions
* @param {(error: any) => BackendError} _createError
*/
static async initialize(worker, _wasm, pipelineOptions, _createError) {
let initializeStart = ChromeUtils.now();
const {
backend,
modelHubRootUrl,
modelHubUrlTemplate,
modelId,
modelRevision,
staticEmbeddingsOptions,
} = pipelineOptions;
// These are the options that are specific to this engine.
const { subfolder, dtype, dimensions, compression, mockedValues } =
staticEmbeddingsOptions;
const extension = compression ? ".zst" : "";
const files = [
`${subfolder}/tokenizer.json${extension}`,
`${subfolder}/${dtype}.d${dimensions}.npy${extension}`,
];
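    // For illustration, with hypothetical options (subfolder "static", dtype
    // "fp16", dimensions 256, compression enabled) the requested files are:
    //   static/tokenizer.json.zst
    //   static/fp16.d256.npy.zst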
/**
* @param {string} fileName
* @returns {Promise<Response | MockedResponse>}
*/
async function getResponse(fileName) {
const url = lazy.createFileUrl({
file: fileName,
model: modelId,
revision: modelRevision,
urlTemplate: modelHubUrlTemplate,
rootUrl: modelHubRootUrl,
});
if (mockedValues) {
const mockedValue = mockedValues[url];
if (!mockedValue) {
throw new Error(
"Could not find mocked value for requested url: " + url
);
}
if (url.endsWith(`.json${extension}`)) {
return new MockedResponse(mockedValue);
}
return new MockedResponse(new Uint8Array(mockedValue).buffer);
}
const modelFile = await worker.getModelFile({ url });
      // The third entry of the `ok` tuple is the file's path in OPFS.
      const filePath = modelFile.ok[2];
const fileHandle = await lazy.OPFS.getFileHandle(filePath);
const file = await fileHandle.getFile();
let stream = file.stream();
if (compression) {
const decompressionStream = new DecompressionStream("zstd");
stream = stream.pipeThrough(decompressionStream);
}
return new Response(stream);
}
const [tokenizerJsonResponse, npyDataResponse] = await Promise.all(
files.map(getResponse)
);
const npyData = await npyDataResponse.arrayBuffer();
const tokenizerJson = await tokenizerJsonResponse.json();
let assetsLoad = ChromeUtils.now();
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
initializeStart,
"Assets load"
);
const { PreTrainedTokenizer } = await lazy.importTransformers(backend);
const tokenizer = new PreTrainedTokenizer(tokenizerJson, {});
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
assetsLoad,
"Tokenizer load"
);
return new StaticEmbeddingsPipeline(
tokenizer,
npyData,
dtype,
dimensions,
initializeStart
);
}
/**
* @param {number} index
*/
#getFp8_E5M2 = index => {
const byte = this.embeddings[index];
// Do some bit manipulation to extract the sign (S), the exponent (E), and the
// mantissa (M)
// This is format: | S E E E | E E M M |
// To do the manipulation, shift the bits to the right (>>) and mask off the relevant
// bits with an & operation.
const sign = (byte >> 7) & 0b0000_0001;
const exponent = (byte >> 2) & 0b0001_1111;
const mantissa = byte & 0b0000_0011;
const bias = 15;
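    // Worked example, using an assumed byte 0b0100_0010 for illustration:
    //   sign = 0, exponent = 0b10000 = 16, mantissa = 0b10 = 2
    //   value = (1 + 2/4) * 2^(16 - 15) = 1.5 * 2 = 3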
if (exponent === 0) {
if (mantissa === 0) {
// Zero
return sign ? -0 : 0;
}
// Subnormal: exponent = 1 - bias, no implicit leading 1
const frac = mantissa / 4; // 2 mantissa bits → divide by 2^2
const value = frac * Math.pow(2, 1 - bias);
return sign ? -value : value;
} else if (exponent === 0x1f) {
if (mantissa === 0) {
return sign ? -Infinity : Infinity;
}
return NaN;
}
// Normalized
const frac = 1 + mantissa / 4;
const value = frac * Math.pow(2, exponent - bias);
return sign ? -value : value;
};
/**
* @param {number} index
*/
#getFp8_E4M3 = index => {
const byte = this.embeddings[index];
// Do some bit manipulation to extract the sign (S), the exponent (E), and the
// mantissa (M)
// This is format: | S E E E | E M M M |
// To do the manipulation, shift the bits to the right (>>) and mask off the relevant
// bits with an & operation.
const sign = (byte >> 7) & 0b0000_0001;
const exponent = (byte >> 3) & 0b0000_1111;
const mantissa = byte & 0b0000_0111;
const bias = 7;
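    // Worked example, using an assumed byte 0b0100_0101 for illustration:
    //   sign = 0, exponent = 0b1000 = 8, mantissa = 0b101 = 5
    //   value = (1 + 5/8) * 2^(8 - 7) = 1.625 * 2 = 3.25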
if (exponent === 0) {
if (mantissa === 0) {
return sign ? -0 : 0;
}
// Subnormal
const frac = mantissa / 8; // 3 mantissa bits → divide by 2^3
const value = frac * Math.pow(2, 1 - bias);
return sign ? -value : value;
} else if (exponent === 0xf) {
if (mantissa === 0) {
return sign ? -Infinity : Infinity;
}
return NaN;
}
// Normalized
const frac = 1 + mantissa / 8;
const value = frac * Math.pow(2, exponent - bias);
return sign ? -value : value;
};
/**
* @param {EmbeddingRequest} request
* @param {number} _requestId
* @param {null} _engineRunOptions
* @returns {EmbeddingResponse}
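   *
   * An illustrative call shape, with hypothetical values (see EmbeddingRequest
   * in StaticEmbeddingsPipeline.d.ts for the real definition):
   *
   * @example
   * pipeline.run(
   *   { args: ["hello world"], options: { pooling: "mean", normalize: true } },
   *   0,
   *   null
   * );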
*/
run(request, _requestId, _engineRunOptions) {
if (request.options.pooling != "mean") {
throw new Error(
`Only "mean" pooling is currently supported, please add support "${request.options.pooling}" here.`
);
}
let tokenCount = 0;
const sequenceCount = request.args.length;
let beforeResponse = ChromeUtils.now();
const response = {
metrics: [],
output: request.args.map(text => {
// Always do the vector math in f32 space, even if the underlying precision
// is lower.
const embedding = new Float32Array(this.#dimensions);
/** @type {number[]} */
const tokenIds = this.#tokenizer.encode(text);
tokenCount += tokenIds.length;
// Sum up the embeddings.
for (const tokenId of tokenIds) {
for (let i = 0; i < this.#dimensions; i++) {
            // Inflate the value into a JavaScript double, then add it.
embedding[i] += this.#getFloat(tokenId * this.#dimensions + i);
}
}
if (request.options.normalize) {
          // Compute the average by dividing by the number of tokens provided.
          // Also compute the sum of the squares while we're here.
let sumSquares = 0;
for (let i = 0; i < this.#dimensions; i++) {
const n = embedding[i] / tokenIds.length;
embedding[i] = n;
sumSquares += n * n;
}
// Apply the normalization.
const magnitude = Math.sqrt(sumSquares);
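          // For example, an averaged vector [3, 4] has magnitude
          // sqrt(9 + 16) = 5 and normalizes to the unit vector [0.6, 0.8].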
if (magnitude != 0) {
for (let i = 0; i < this.#dimensions; i++) {
embedding[i] = embedding[i] / magnitude;
}
}
} else {
          // Only compute the average by dividing by the number of tokens provided.
for (let i = 0; i < this.#dimensions; i++) {
embedding[i] = embedding[i] / tokenIds.length;
}
}
return embedding;
}),
};
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
beforeResponse,
`Processed ${sequenceCount} sequences with ${tokenCount} tokens.`
);
if (this.#initializeStart) {
ChromeUtils.addProfilerMarker(
"StaticEmbeddingsPipeline",
this.#initializeStart,
"Time to first response"
);
this.#initializeStart = null;
}
return response;
}
}