/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
import { ENGLISH_STOP_WORDS } from "chrome://global/content/ml/StopWords.sys.mjs";
// max number of keywords to extract for each document in corpus
const MAX_WORDS_PER_DOCUMENT = 3;
// min threshold to extract a keyword from document in corpus
const MIN_THRESHOLD_FOR_KEYWORD = 0.05;
/**
* A simple implementation of matrix multiplication
*
* @param {number[][]} A 2D matrix
* @param {number[][]} B 2D matrix
* @returns {number[][]} 2D matrix multiplication result
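*
* @example
* // Illustrative 2x2 case:
* // matMul([[1, 2], [3, 4]], [[5, 6], [7, 8]]) returns [[19, 22], [43, 50]]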
*/
export function matMul(A, B) {
// Check if multiplication is possible (i.e., number of columns in A == number of rows in B)
if (!A || !B || !A.length || A[0].length !== B.length) {
throw new Error("Matrix dimensions are incompatible for multiplication.");
}
// Create the result matrix with appropriate dimensions (rows of A, columns of B)
const result = [];
for (let i = 0; i < A.length; i++) {
result[i] = [];
for (let j = 0; j < B[0].length; j++) {
let sum = 0;
// Sum the product of corresponding elements
for (let k = 0; k < A[0].length; k++) {
sum += A[i][k] * B[k][j];
}
result[i][j] = sum;
}
}
return result;
}
/**
* Calculates cosine similarity between two lists of floats
* The lists don't need to be normalized
*
* @param {number[]} A first list
* @param {number[]} B second list
* @returns {number} cosine similarity value
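*
* @example
* // Illustrative values: parallel vectors score 1, orthogonal vectors score 0.
* // cosSim([1, 0], [2, 0]) returns 1
* // cosSim([1, 0], [0, 1]) returns 0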
*/
export function cosSim(A, B) {
if (!A || !B || A.length !== B.length) {
throw new Error("Lists should have same lengths and not be undefined");
}
if (A.length === 0) {
return 0;
}
let dotProduct = 0;
let mA = 0;
let mB = 0;
for (let i = 0; i < A.length; i++) {
dotProduct += A[i] * B[i];
mA += A[i] * A[i];
mB += B[i] * B[i];
}
mA = Math.sqrt(mA);
mB = Math.sqrt(mB);
return mA === 0 || mB === 0 ? 0 : dotProduct / (mA * mB);
}
/**
* This class is used to generate a vectorized count of keywords
* that are not part of the chosen language stopwords
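*
* @example
* // A minimal usage sketch (the two documents here are illustrative):
* // const cv = new CountVectorizer();
* // const counts = cv.fitTransform(["the cat sat", "the dog ran"]);
* // counts has one row per document and one column per vocabulary term.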
*/
export class CountVectorizer {
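/**
* @param {string} stopWords language code for the stop-word list;
*   "EN" (the default) uses ENGLISH_STOP_WORDS, any other value uses no stop words
*/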
constructor(stopWords = "EN") {
this.stopWords = new Set(stopWords === "EN" ? ENGLISH_STOP_WORDS : []);
}
/**
* Tokenize a document: strip numbers, split on non-word characters,
* lowercase, and drop stop words
*
* @param {string} doc String to tokenize
* @returns {string[]} List containing token strings
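*
* @example
* // tokenize("The 3 cats sat!") returns ["cats", "sat"],
* // assuming "the" is in ENGLISH_STOP_WORDS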
*/
tokenize(doc) {
return doc
.replace(/[0-9]/g, "") // remove numbers
.split(/\W+/) // split on any non-word character
.map(token => token.toLowerCase()) // convert to lowercase
.filter(word => word && !this.stopWords.has(word)); // drop empty tokens and stop-words
}
/**
* Fit to corpus and generate the vocabulary
*
* @param {string[]} corpus list of documents to fit the vectorizer to
*/
fit(corpus) {
this.tokenizedCorpus = corpus.map(doc => this.tokenize(doc));
const vocabSet = new Set();
for (let doc of this.tokenizedCorpus) {
for (let token of doc) {
if (token) {
vocabSet.add(token);
}
}
}
const vocab = Array.from(vocabSet);
this.vocabToIdx = {};
this.vocabLength = vocab.length;
for (let i = 0; i < vocab.length; i++) {
this.vocabToIdx[vocab[i]] = i;
}
}
/**
* Count occurrences of vocabulary tokens in each tokenized document
*
* @param {string[][]} tokenizedCorpus list of tokenized documents to transform
* @returns {number[][]} token counts per document
*/
#transformTokenized(tokenizedCorpus) {
return tokenizedCorpus.map(doc => {
const counts = new Array(this.vocabLength).fill(0);
for (let token of doc) {
if (token in this.vocabToIdx) {
counts[this.vocabToIdx[token]]++;
}
}
return counts;
});
}
/**
* Transform text based on the vocabulary learned during fit
*
* @param {string[]} corpus list of strings to transform; if omitted, the
*   corpus tokenized during fit is reused
* @returns {number[][]} transformed counts based on tokens
*/
transform(corpus) {
// Tokenize the given corpus, falling back to the corpus cached by fit,
// so the counts reflect the documents actually passed in
const tokenizedCorpus = corpus
? corpus.map(doc => this.tokenize(doc))
: this.tokenizedCorpus;
return this.#transformTokenized(tokenizedCorpus);
}
/**
* Fit to the corpus and transform it in one step
*
* @param {string[]} corpus list of strings to fit and transform
* @returns {number[][]} transformed counts based on tokens
*/
fitTransform(corpus) {
this.fit(corpus);
return this.#transformTokenized(this.tokenizedCorpus);
}
/**
* Get the vocabulary terms learned during fit
*
* @returns {string[]} list of vocabulary terms
*/
getFeatureNamesOut() {
return Object.keys(this.vocabToIdx);
}
}
/**
* Simplified c-TF-IDF implementation based off of
*
* This class identifies keywords important to clusters of documents
* by assigning scores to each keyword in the cluster relative to all clusters
* present in the corpus
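*
* @example
* // A minimal sketch chaining CountVectorizer into CTfIdf
* // (clusterDocs, one string of joined documents per cluster, is illustrative):
* // const counts = new CountVectorizer().fitTransform(clusterDocs);
* // const scores = new CTfIdf().fitTransform(counts);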
*/
export class CTfIdf {
/**
* Generate diagonal matrix using 1D array
*
* @param {number[]} array 1D array to create a diagonalized matrix for
* @returns {number[][]} 2D matrix with diagonal values
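*
* @example
* // createDiagonalMatrix([1, 2]) returns [[1, 0], [0, 2]]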
*/
createDiagonalMatrix(array) {
const n = array.length; // Length of the input array
const matrix = Array.from({ length: n }, () => Array(n).fill(0)); // Create an n x n matrix filled with 0
for (let i = 0; i < n; i++) {
matrix[i][i] = array[i]; // Place elements of the array along the main diagonal
}
return matrix;
}
/**
* Normalize a 2D array row-wise using the L1 norm
*
* @param {number[][]} array 2D matrix to be normalized in place
* @returns {number[][]} 2D matrix with normalized values by row
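*
* @example
* // normalize([[1, 3], [2, 2]]) returns [[0.25, 0.75], [0.5, 0.5]]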
*/
normalize(array) {
// Perform normalization row-wise (axis=1)
for (let i = 0; i < array.length; i++) {
const row = array[i];
// Calculate the L1 norm (sum of absolute values)
const rowSum = row.reduce((sum, value) => sum + Math.abs(value), 0);
if (rowSum !== 0) {
// Prevent division by zero
for (let j = 0; j < row.length; j++) {
row[j] = row[j] / rowSum; // Normalize each element in the row
}
}
}
return array;
}
/**
* Learn the "idf" portion (global term weights)
* by fitting to the count-vectorized array
*
* @param {number[][]} array A matrix of term/token counts.
*/
fit(array) {
if (array.length === 0 || array[0].length === 0) {
throw new Error("Non-Empty 2D array is required");
}
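// df[j]: total count of term j across all clusters (class-based frequency)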
const df = array[0].map((_, colIndex) =>
array.reduce((sum, row) => sum + row[colIndex], 0)
);
const avgNrSamples = Math.floor(
array.reduce(
(total, row) => total + row.reduce((sum, value) => sum + value, 0),
0
) / array.length
);
const idf = df.map(value => Math.log(avgNrSamples / value + 1));
this.idfDiag = this.createDiagonalMatrix(idf);
}
/**
* Score terms in a count matrix using the global term weights
* learned during fit
*
* @param {number[][]} array 2D count matrix to transform
* @returns {number[][]} transformed matrix of term scores
*/
transform(array) {
const X = this.normalize(array);
return matMul(X, this.idfDiag);
}
/**
* Apply the fit and transform operations on the same corpus
*
* @param {number[][]} array 2D count matrix to fit and transform
* @returns {number[][]} transformed matrix
*/
fitTransform(array) {
this.fit(array);
return this.transform(array);
}
}
/**
* This class extracts unique keywords from a corpus of documents
* It currently uses the CountVectorizer and CTfIdf classes but can
* be extended as more methods are added
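*
* @example
* // A minimal usage sketch (the document strings are illustrative):
* // const extractor = new KeywordExtractor();
* // const keywords = extractor.fitTransform(["all about cats", "all about dogs"]);
* // keywords[i] holds up to MAX_WORDS_PER_DOCUMENT keywords for document i.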
*/
export class KeywordExtractor {
constructor() {
this.cv = new CountVectorizer();
this.ctfIdf = new CTfIdf();
}
/**
* Generate the CtfIdf vector of important keywords and their scores
* and return the n most important keywords
*
* @param {string[]} corpus list of documents to extract keywords from
* @param {number} n max number of keywords to extract per document
* @returns {string[][]} list of keywords per document
*/
fitTransform(corpus, n = MAX_WORDS_PER_DOCUMENT) {
const transformedCorpus = this.cv.fitTransform(corpus);
this.transformedCtfIdf = this.ctfIdf.fitTransform(transformedCorpus);
const { sortedScores, sortedIndices } = this.#getSortedScoresForKeywords();
return this.#getKeywordsPerCluster(sortedIndices, sortedScores, n);
}
/**
* Generate the sorted keywords along with their corresponding scores
*
* @returns {{sortedScores: number[][], sortedIndices: number[][]}}
*/
#getSortedScoresForKeywords() {
const sortedIndices = this.transformedCtfIdf.map(row => {
return row
.map((value, index) => ({ value, index })) // Pair each value with its index
.sort((a, b) => b.value - a.value) // Sort in descending order
.map(item => item.index); // Extract sorted indices
});
const sortedScores = this.transformedCtfIdf.map((row, clusterIndex) => {
return sortedIndices[clusterIndex].map(index => row[index]);
});
return { sortedIndices, sortedScores };
}
/**
* For each cluster of documents in the corpus, extract the important keywords
*
* @param {number[][]} sortedIndices
* @param {number[][]} sortedScores
* @param {number} n max number of keywords to extract
* @returns {string[][]}
*/
#getKeywordsPerCluster(sortedIndices, sortedScores, n) {
const numClusters = this.transformedCtfIdf.length;
const numWords = this.transformedCtfIdf[0].length;
const keywords = this.cv.getFeatureNamesOut();
const keywordsForCluster = [];
for (let cluster = 0; cluster < numClusters; cluster++) {
const topicWords = [];
// fetch the important keywords for each document
for (let topScoreRef = 0; topScoreRef < numWords; topScoreRef++) {
// break if we find a score below the minimum threshold, since the array is score-sorted,
// or if we have reached the limit on the number of keywords to return
if (
sortedScores[cluster][topScoreRef] < MIN_THRESHOLD_FOR_KEYWORD ||
topicWords.length >= n
) {
break;
}
const wordIndex = sortedIndices[cluster][topScoreRef];
topicWords.push(keywords[wordIndex]);
}
// Add the topic words to the list
keywordsForCluster.push(topicWords);
}
return keywordsForCluster;
}
}