Source code

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
/**
* The URL of the worker script. WorkerManager passes this to the Worker constructor
* each time it needs to create a new worker.
*/
const WORKER_URL = "resource://gre/modules/translations/cld-worker.js";
/**
* The options used for when detecting a language.
*
* @typedef {object} DetectionOptions
*
* @property {string} text - The text to analyze.
* @property {boolean} [isHTML] - A boolean, indicating whether the text should be analyzed as
* HTML rather than plain text.
* @property {string} [language] - A string indicating the expected language. For text
* extracted from HTTP documents, this is expected to come from the Content-Language
* header.
* @property {string} [tld] - A string indicating the top-level domain of the document the
* text was extracted from.
* @property {string} [encoding] - A string describing the encoding of the document the
* string was extracted from. Note that, regardless of the value of this property,
* the 'text' property must be a UTF-16 JavaScript string.
*/
/**
* A larger web document can be composed of multiple languages. This object details the
* breakdown of what languages are present in the document, and at what percentages.
* For instance a document could be 70% English and 30% French:
*
* [
* { language: "en", percent: 70 },
* { language: "fr", percent: 30 },
* ]
*
* @typedef {object} MultilingualSection
* @property {string} language - BCP 47 language tag, or "un" for unknown.
* @property {number} percent - The integral percentage ranged 0-100.
*/
/**
* @typedef {object} DetectionResult
* @property {string} language - The language code
* @property {boolean} confident - Whether the detector is confident of the result.
* @property {Array<MultilingualSection>} languages - The list of languages detected in
* multilingual content. This is between 0 and 3 languages.
*/
/**
* The length of the substring to pull from the document's text for language
* identification.
*
* This value should ideally be one that is large enough to yield a confident
* identification result without being too large or expensive to extract.
*
* At this time, this value is not driven by statistical data or analysis.
*
* It is passed as the maximum length to the document encoder in
* LanguageDetector.detectLanguageFromDocument.
*/
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
/**
* The shorter the text, the less confidence we should have in the result of the language
* identification. Add another heuristic to report the ID as not confident if the length
* of the text, measured in UTF-16 code units (i.e. `text.length`), is less than this
* threshold.
*
* This was determined by plotting a kernel density estimation of the number of times the
* source language had to be changed in the SelectTranslationsPanel vs. the code units in
* the source text.
*
* 0013 code units or less - 49.5% of language changes
* 0036 code units or less - 74.9% of language changes
* 0153 code units or less - 90.0% of language changes
* 0200 code units or less - 91.5% of language changes
* 0427 code units or less - 95.0% of language changes
* 1382 code units or less - 98.0% of language changes
* 3506 code units or less - 99.0% of language changes
*
* The chosen value corresponds to the 91.5% row above.
*/
const DOC_CONFIDENCE_THRESHOLD = 200;
/**
* An internal class to manage communicating to the worker, and managing its lifecycle.
* It's initialized once below statically to the module.
*/
class WorkerManager {
  // Emscripten is able to grow its heap but cannot shrink it again. To keep resident
  // memory from creeping upward, the worker is recycled after it has processed a
  // particularly large string.
  //
  // The two values below are the string-length cut-off and the idle timeout (in
  // milliseconds). Once a string at or above the cut-off has been processed, the worker
  // is marked for destruction and gets terminated as soon as it has been idle for the
  // timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth for a 2MB heap.
  LARGE_STRING = 1.5 * 1024 * 1024;
  IDLE_TIMEOUT = 10_000;

  /**
   * Pending resolvers, one per in-flight detection request, in the order the requests
   * were posted to the worker. Each worker reply settles the oldest entry.
   *
   * @type {Array<(result: DetectionResult) => void>}
   */
  detectionQueue = [];

  /**
   * The live worker instance, or null when none has been created (or it was terminated).
   *
   * @type {Worker | null}
   */
  worker = null;

  /**
   * Resolves with the worker once it has reported "ready". Null when there is no worker.
   *
   * @type {Promise<Worker> | null}
   */
  workerPromise = null;

  /**
   * The ID of the pending idle-cleanup setTimeout, or null when no cleanup is scheduled.
   *
   * @type {number | null}
   */
  idleTimeoutId = null;

  /**
   * Sends the options to the worker and waits for its reply.
   *
   * @param {DetectionOptions} options
   * @returns {Promise<DetectionResult>}
   */
  async detectLanguage(options) {
    const readyWorker = await this.getWorker();

    // The resolver is queued before the message is posted, so it is already in place
    // when the matching reply is dispatched by the onmessage handler.
    const detection = await new Promise(resolve => {
      this.detectionQueue.push(resolve);
      readyWorker.postMessage(options);
    });

    // The reply is in. (Re)schedule worker termination when either this input was large
    // enough to have triggered heap growth, or a termination is already pending, in
    // which case the idle timer is restarted.
    const cleanupAlreadyPending = this.idleTimeoutId != null;
    const inputWasLarge = options.text.length >= this.LARGE_STRING;
    if (inputWasLarge || cleanupAlreadyPending) {
      this.flushWorker();
    }

    return detection;
  }

  /**
   * Lazily creates the worker and returns a promise for it. Repeated calls share the
   * same promise until the worker is terminated by the idle cleanup.
   *
   * @returns {Promise<Worker>}
   */
  getWorker() {
    if (this.workerPromise) {
      return this.workerPromise;
    }

    this.workerPromise = new Promise(resolve => {
      const spawned = new Worker(WORKER_URL);

      spawned.onmessage = ({ data }) => {
        if (data == "ready") {
          // Start-up handshake: the worker can now accept detection requests.
          resolve(spawned);
          return;
        }

        // Any other message is a detection result for the oldest queued request.
        /** @type {DetectionResult} */
        const detectionResult = data;
        const settleOldest = this.detectionQueue.shift();
        settleOldest(detectionResult);
      };

      this.worker = spawned;
    });

    return this.workerPromise;
  }

  /**
   * Schedule the current worker to be terminated after the idle timeout. Any previously
   * scheduled termination is cancelled first, so calling this restarts the countdown.
   */
  flushWorker() {
    if (this.idleTimeoutId != null) {
      clearTimeout(this.idleTimeoutId);
    }

    const onIdleTimeout = () => {
      if (this.detectionQueue.length) {
        // Requests are still in flight, so push the termination back.
        this.flushWorker();
        return;
      }

      // Nothing is pending: terminate the worker and reset all of the state so that the
      // next request creates a fresh one.
      this.worker?.terminate();
      this.worker = null;
      this.workerPromise = null;
      this.idleTimeoutId = null;
    };

    this.idleTimeoutId = setTimeout(onIdleTimeout, this.IDLE_TIMEOUT);
  }
}
/**
* The worker manager is static to this module. It is exported for unit testing.
*/
export const workerManager = new WorkerManager();
/**
 * Static entry points for language detection. All detection work is delegated to the
 * module-level `workerManager`.
 */
export class LanguageDetector {
  /**
   * Detect the language of a given string.
   *
   * @param {DetectionOptions | string} options - Either the text to analyze,
   *   or the options. A bare string is wrapped as `{ text }`.
   * @returns {Promise<DetectionResult>}
   */
  static detectLanguage(options) {
    const detectionOptions =
      typeof options === "string" ? { text: options } : options;
    return workerManager.detectLanguage(detectionOptions);
  }

  /**
   * Attempts to determine the language in which the document's content is written.
   *
   * At most DOC_TEXT_TO_IDENTIFY_LENGTH of the document's plain text is sampled. When
   * the sampled text is shorter than DOC_CONFIDENCE_THRESHOLD, the result is reported
   * as not confident regardless of what the detector returned.
   *
   * @param {Document} document
   * @returns {Promise<DetectionResult>}
   */
  static async detectLanguageFromDocument(document) {
    // Serialize a sample of the document as plain text, skipping invisible content.
    const plainTextEncoder = Cu.createDocumentEncoder("text/plain");
    plainTextEncoder.init(
      document,
      "text/plain",
      plainTextEncoder.SkipInvisibleContent
    );
    const sample = plainTextEncoder.encodeToStringWithMaxLength(
      DOC_TEXT_TO_IDENTIFY_LENGTH
    );

    // Drop carriage returns and turn newlines into spaces.
    const text = sample.replaceAll("\r", "").replaceAll("\n", " ");

    const detection = await workerManager.detectLanguage({ text });

    // Short samples are unreliable, so override the detector's own confidence.
    if (text.length < DOC_CONFIDENCE_THRESHOLD) {
      detection.confident = false;
    }
    return detection;
  }
}