Source code

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
// workerManager is exported for tests.
import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
/**
 * The maximum length of the substring to pull from the document's text for
 * language identification. It is the value handed to
 * `encodeToStringWithMaxLength` in
 * `LanguageDetector.detectLanguageFromDocument`.
 *
 * This value should ideally be large enough to yield a confident
 * identification result without being too large or expensive to extract.
 *
 * At this time, this value is not driven by statistical data or analysis.
 *
 * For the moment, while we investigate which language identification library
 * we would like to use, keep this logic in sync with
 * language-id-engine.sys.mjs.
 */
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
/**
 * Owns the language-detection worker: creates it lazily, forwards detection
 * requests to it, pairs each reply with the request that caused it, and
 * terminates the worker once it has been idle after processing a large input.
 */
export var workerManager = {
  // Since Emscripten can handle heap growth, but not heap shrinkage, we
  // need to refresh the worker after we've processed a particularly large
  // string in order to prevent unnecessary resident memory growth.
  //
  // These values define the cut-off string length and the idle timeout
  // (in milliseconds) before destroying a worker. Once a string of the
  // maximum size has been processed, the worker is marked for
  // destruction, and is terminated as soon as it has been idle for the
  // given timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth
  // for a 2MB heap.
  LARGE_STRING: 1.5 * 1024 * 1024,
  IDLE_TIMEOUT: 10 * 1000,

  // FIFO list of `{ resolve }` entries, one per request that has been posted
  // to the worker and not yet answered. Replies are matched to requests
  // purely by position, so this list must only ever contain entries whose
  // message was actually posted.
  detectionQueue: [],

  /**
   * Post a detection request to the worker and wait for its reply.
   *
   * @param {object} aParams
   *   The message posted to the worker as-is. Its `text.length` is read
   *   afterwards to decide whether the worker should be recycled.
   * @returns {Promise<*>}
   *   Resolves with the data of the worker's reply. Rejects if the message
   *   could not be posted.
   */
  detectLanguage(aParams) {
    return this.workerReady
      .then(worker => {
        return new Promise(resolve => {
          const entry = { resolve };
          this.detectionQueue.push(entry);
          try {
            worker.postMessage(aParams);
          } catch (ex) {
            // The message never reached the worker, so no reply will ever
            // arrive for this entry. Leaving it queued would hand every
            // later reply to the wrong caller and keep the queue non-empty
            // forever, which in turn blocks _flushWorker from terminating
            // the worker.
            const index = this.detectionQueue.lastIndexOf(entry);
            if (index !== -1) {
              this.detectionQueue.splice(index, 1);
            }
            // Rethrowing inside the executor rejects the returned promise.
            throw ex;
          }
        });
      })
      .then(result => {
        // We have our asynchronous result from the worker.
        //
        // Determine if our input was large enough to trigger heap growth,
        // or if we're already waiting to destroy the worker when it's
        // idle. If so, schedule termination after the idle timeout.
        if (
          aParams.text.length >= this.LARGE_STRING ||
          this._idleTimeout != null
        ) {
          this.flushWorker();
        }
        return result;
      });
  },

  // The live worker instance, or null when none has been created (or the
  // previous one has been terminated).
  _worker: null,
  // Cached promise handed out by `workerReady`; reset to null together with
  // `_worker` so the next access creates a fresh worker.
  _workerReadyPromise: null,

  /**
   * A promise for the worker, created on first access. It resolves with the
   * worker once the worker posts the string "ready"; every other message is
   * treated as the reply to the oldest entry in `detectionQueue`.
   */
  get workerReady() {
    if (!this._workerReadyPromise) {
      this._workerReadyPromise = new Promise(resolve => {
        let worker = new Worker(WORKER_URL);
        worker.onmessage = aMsg => {
          if (aMsg.data === "ready") {
            resolve(worker);
          } else {
            this.detectionQueue.shift().resolve(aMsg.data);
          }
        };
        this._worker = worker;
      });
    }
    return this._workerReadyPromise;
  },

  // Holds the ID of the current pending idle cleanup setTimeout.
  _idleTimeout: null,

  // Schedule the current worker to be terminated after the idle timeout.
  // Any previously scheduled termination is cancelled first, so calling
  // this repeatedly keeps pushing the deadline back.
  flushWorker() {
    if (this._idleTimeout != null) {
      clearTimeout(this._idleTimeout);
    }
    this._idleTimeout = setTimeout(
      this._flushWorker.bind(this),
      this.IDLE_TIMEOUT
    );
  },

  // Immediately terminate the worker, as long as there are no pending
  // results. Otherwise, reschedule termination until after the next
  // idle timeout.
  _flushWorker() {
    if (this.detectionQueue.length) {
      this.flushWorker();
      return;
    }
    if (this._worker) {
      this._worker.terminate();
    }
    this._worker = null;
    this._workerReadyPromise = null;
    this._idleTimeout = null;
  },
};
export var LanguageDetector = {
  /**
   * Identify the language a piece of text is written in.
   *
   * Accepts either a plain string holding the text to analyze, or an
   * object with these properties:
   *
   *  - 'text' The text to analyze.
   *
   *  - 'isHTML' (optional) A boolean, indicating whether the text
   *       should be analyzed as HTML rather than plain text.
   *
   *  - 'language' (optional) A string indicating the expected language.
   *       For text extracted from HTTP documents, this is expected to
   *       come from the Content-Language header.
   *
   *  - 'tld' (optional) A string indicating the top-level domain of the
   *       document the text was extracted from.
   *
   *  - 'encoding' (optional) A string describing the encoding of the
   *       document the string was extracted from. Note that, regardless
   *       of the value of this property, the 'text' property must be a
   *       UTF-16 JavaScript string.
   *
   * A bare string is wrapped as `{ text }`; an object is forwarded to
   * `workerManager.detectLanguage` untouched.
   *
   * @returns {Promise<Object>}
   * @resolves When detection is finished, with an object containing
   * these fields:
   *  - 'language' (string with a language code)
   *  - 'confident' (boolean) Whether the detector is confident of the
   *      result.
   *  - 'languages' (array) An array of up to three elements, containing
   *      the most prevalent languages detected. It contains a
   *      'languageCode' property, containing the ISO language code of
   *      the language, and a 'percent' property, describing the
   *      approximate percentage of the input which is in that language.
   *      For text of an unknown language, the result may contain an
   *      entry with the language code 'un', indicating the percent of
   *      the text which is unknown.
   */
  detectLanguage(aParams) {
    const request = typeof aParams === "string" ? { text: aParams } : aParams;
    return workerManager.detectLanguage(request);
  },

  /**
   * Try to work out which language a document's content is written in.
   *
   * A plain-text excerpt of at most DOC_TEXT_TO_IDENTIFY_LENGTH is encoded
   * from the document (skipping invisible content), carriage returns are
   * dropped and newlines become spaces, and the excerpt is sent for
   * detection. Afterwards the worker is scheduled for idle termination.
   *
   * For the moment, while we investigate which language identification
   * library we would like to use, keep this logic in sync with
   * language-id-engine.sys.mjs
   *
   * @returns {string | null}
   *   The detected language code, or null when the detector is not
   *   confident of its result.
   */
  async detectLanguageFromDocument(aDocument) {
    // Pull a plain-text sample out of the document.
    const docEncoder = Cu.createDocumentEncoder("text/plain");
    docEncoder.init(aDocument, "text/plain", docEncoder.SkipInvisibleContent);
    const excerpt = docEncoder.encodeToStringWithMaxLength(
      DOC_TEXT_TO_IDENTIFY_LENGTH
    );

    // Flatten the sample onto a single line.
    const text = excerpt.replaceAll("\r", "").replaceAll("\n", " ");

    const detection = await workerManager.detectLanguage({ text });
    const { language, confident } = detection;

    workerManager.flushWorker();

    if (confident) {
      return language;
    }
    return null;
  },
};