// LanguageDetector.sys.mjs

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this file,

 * You can obtain one at http://mozilla.org/MPL/2.0/. */

// workerManager is exported for tests.

import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";

// URL of the script loaded into the Worker that workerManager creates to
// service language-detection requests.
const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";

/**
 * The length of the substring to pull from the document's text for language
 * identification.
 *
 * This value should ideally be one that is large enough to yield a confident
 * identification result without being too large or expensive to extract.
 *
 * At this time, this value is not driven by statistical data or analysis.
 *
 * For the moment, while we investigate which language identification library
 * we would like to use, keep this logic in sync with
 * language-id-engine.sys.mjs.
 */
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;

/**
 * Owns the language-detection worker: creates it on first use, forwards
 * requests to it, and terminates it after an idle period once it has been
 * marked for destruction.
 */
export var workerManager = {
  // Since Emscripten can handle heap growth, but not heap shrinkage, we
  // need to refresh the worker after we've processed a particularly large
  // string in order to prevent unnecessary resident memory growth.
  //
  // These values define the cut-off string length and the idle timeout
  // (in milliseconds) before destroying a worker. Once a string of the
  // maximum size has been processed, the worker is marked for
  // destruction, and is terminated as soon as it has been idle for the
  // given timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth
  // for a 2MB heap.
  LARGE_STRING: 1.5 * 1024 * 1024,
  IDLE_TIMEOUT: 10 * 1000,

  // Requests that have been posted to the worker and are still awaiting a
  // reply, oldest first. Each entry carries the `resolve` callback of the
  // promise handed back to the caller.
  detectionQueue: [],

  /**
   * Post a detection request to the worker and wait for its reply.
   *
   * @param {object} aParams
   *   The request, posted to the worker unchanged. Its `text` property must
   *   be a string: its length decides whether the worker gets recycled.
   * @returns {Promise<*>}
   *   Resolves with the `data` of the worker message that answers this
   *   request.
   */
  detectLanguage(aParams) {
    return this.workerReady
      .then(worker => {
        return new Promise(resolve => {
          // Queue the resolver before posting so that the reply handler
          // always finds an entry to resolve.
          this.detectionQueue.push({ resolve });
          worker.postMessage(aParams);
        });
      })
      .then(result => {
        // We have our asynchronous result from the worker.
        //
        // Determine if our input was large enough to trigger heap growth,
        // or if we're already waiting to destroy the worker when it's
        // idle. If so, schedule termination after the idle timeout.
        if (
          aParams.text.length >= this.LARGE_STRING ||
          this._idleTimeout != null
        ) {
          this.flushWorker();
        }

        return result;
      });
  },

  // The live worker, or null when none has been created or the previous one
  // was terminated.
  _worker: null,

  // Cached promise behind `workerReady`; reset to null when the worker is
  // terminated so that the next request creates a fresh one.
  _workerReadyPromise: null,

  /**
   * Promise for a usable worker. The worker is created on first access and
   * the promise resolves with it once the worker has posted its "ready"
   * message. Every later message is treated as the reply to the oldest
   * entry in `detectionQueue`.
   */
  get workerReady() {
    if (!this._workerReadyPromise) {
      this._workerReadyPromise = new Promise(resolve => {
        let worker = new Worker(WORKER_URL);
        worker.onmessage = aMsg => {
          if (aMsg.data == "ready") {
            resolve(worker);
          } else {
            // Replies are matched to requests in FIFO order.
            this.detectionQueue.shift().resolve(aMsg.data);
          }
        };
        this._worker = worker;
      });
    }

    return this._workerReadyPromise;
  },

  // Holds the ID of the current pending idle cleanup setTimeout.
  _idleTimeout: null,

  // Schedule the current worker to be terminated after the idle timeout.
  // Any termination that is already pending is cancelled first, so calling
  // this again restarts the countdown.
  flushWorker() {
    if (this._idleTimeout != null) {
      clearTimeout(this._idleTimeout);
    }

    this._idleTimeout = setTimeout(
      this._flushWorker.bind(this),
      this.IDLE_TIMEOUT
    );
  },

  // Immediately terminate the worker, as long as there are no pending
  // results. Otherwise, reschedule termination until after the next
  // idle timeout.
  _flushWorker() {
    if (this.detectionQueue.length) {
      this.flushWorker();
    } else {
      if (this._worker) {
        this._worker.terminate();
      }

      // Forget the worker and its ready-promise so that the next access to
      // `workerReady` builds a new one.
      this._worker = null;
      this._workerReadyPromise = null;
      this._idleTimeout = null;
    }
  },
};

/**
 * Public entry points for language detection. Both methods delegate the
 * actual work to workerManager.
 */
export var LanguageDetector = {
  /**
   * Detect the language of a given string.
   *
   * The argument may be either a string containing the text to analyze,
   * or an object with the following properties:
   *
   *  - 'text' The text to analyze.
   *
   *  - 'isHTML' (optional) A boolean, indicating whether the text
   *      should be analyzed as HTML rather than plain text.
   *
   *  - 'language' (optional) A string indicating the expected language.
   *      For text extracted from HTTP documents, this is expected to
   *      come from the Content-Language header.
   *
   *  - 'tld' (optional) A string indicating the top-level domain of the
   *      document the text was extracted from.
   *
   *  - 'encoding' (optional) A string describing the encoding of the
   *      document the string was extracted from. Note that, regardless
   *      of the value of this property, the 'text' property must be a
   *      UTF-16 JavaScript string.
   *
   * @param {string | object} aParams
   *   The text itself, or a request object as described above.
   * @returns {Promise<Object>}
   * @resolves When detection is finished, with an object containing
   * these fields:
   *  - 'language' (string with a language code)
   *  - 'confident' (boolean) Whether the detector is confident of the
   *      result.
   *  - 'languages' (array) An array of up to three elements, containing
   *      the most prevalent languages detected. It contains a
   *      'languageCode' property, containing the ISO language code of
   *      the language, and a 'percent' property, describing the
   *      approximate percentage of the input which is in that language.
   *      For text of an unknown language, the result may contain an
   *      entry with the language code 'un', indicating the percent of
   *      the text which is unknown.
   */
  detectLanguage(aParams) {
    // Normalize the bare-string form into the object form the worker manager
    // expects.
    if (typeof aParams === "string") {
      aParams = { text: aParams };
    }

    return workerManager.detectLanguage(aParams);
  },

  /**
   * Attempts to determine the language in which the document's content is
   * written.
   *
   * For the moment, while we investigate which language identification library
   * we would like to use, keep this logic in sync with
   * language-id-engine.sys.mjs.
   *
   * @param {Document} aDocument
   *   The document whose text is sampled; it is handed to the document
   *   encoder's init().
   * @returns {Promise<string | null>}
   *   Resolves with the detected language code when the detector reports a
   *   confident result, and with null otherwise.
   */
  async detectLanguageFromDocument(aDocument) {
    // Grab a selection of text.
    const encoder = Cu.createDocumentEncoder("text/plain");
    encoder.init(aDocument, "text/plain", encoder.SkipInvisibleContent);

    // Drop carriage returns and turn newlines into spaces so the sample is a
    // single line of text.
    const text = encoder
      .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
      .replaceAll("\r", "")
      .replaceAll("\n", " ");

    const { language, confident } = await workerManager.detectLanguage({
      text,
    });

    // Schedule the worker for termination once it has been idle for the
    // manager's timeout.
    workerManager.flushWorker();

    return confident ? language : null;
  },
};