const JsMIMEmimeutils = function () {
/**
* Decode a quoted-printable buffer into a binary string.
*
* @param {BinaryString} buffer - The string to decode.
* @returns {BinaryString[]} The first element of the array is the decoded
* string. The second element is always the empty string.
*/
function decode_qp(buffer) {
// Unlike base64, quoted-printable isn't stateful across multiple lines, so
// there is no need to buffer input; there is never leftover data to retain
// for a later call.
const decoded = buffer.replace(
// Replace either =<hex><hex> or =<wsp>CRLF
/=([0-9A-F][0-9A-F]|[ \t]*(\r\n|[\r\n]|$))/gi,
function (match, param) {
// If trailing text matches [ \t]*CRLF, drop everything, since it's a
// soft line break.
if (param.trim().length == 0) {
return "";
}
return String.fromCharCode(parseInt(param, 16));
}
);
return [decoded, ""];
}
/**
* Decode a base64 buffer into a binary string. Unlike window.atob, the buffer
* may contain non-base64 characters that will be ignored.
*
* @param {BinaryString} buffer - The string to decode.
* @param {boolean} more - If true, we expect that this function could be
* called again and should retain extra data. If false, we should flush
* all pending output.
* @returns {BinaryString[]} The first element of the array is the decoded
* string. The second element contains the data that could not be decoded
* and needs to be retained for the next call.
*/
function decode_base64(buffer, more) {
// Drop all non-base64 characters
let sanitize = buffer.replace(/[^A-Za-z0-9+\/=]/g, "");
// Remove harmful `=' chars in the middle.
sanitize = sanitize.replace(/=+([A-Za-z0-9+\/])/g, "$1");
// We need to decode in groups of 4 chars. If we don't have enough, leave the
// excess for later. If no more input is coming, drop the excess so the length
// is a multiple of 4.
const excess = sanitize.length % 4;
if (excess != 0 && more) {
buffer = sanitize.slice(-excess);
} else {
buffer = "";
}
sanitize = sanitize.substring(0, sanitize.length - excess);
// Delete all unnecessary '====' in padding.
sanitize = sanitize.replace(/(====)+$/g, "");
// Use the atob function we (ought to) have in global scope.
return [atob(sanitize), buffer];
}
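// Illustrative usage of the two decoders above (a sketch; the inputs are
// hypothetical, and the caller is assumed to prepend any leftover data from a
// previous decode_base64 call to the next chunk):
//
//   decode_qp("Caf=C3=A9 =\r\nau lait")
//     -> ["Caf\xC3\xA9 au lait", ""]        // "=\r\n" is a soft line break
//   decode_base64("aGVsbG8gd2", true)
//     -> ["hello ", "d2"]                   // "d2" is retained for later
//   decode_base64("d2" + "9ybGQh", false)
//     -> ["world!", ""]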
/**
* Converts a binary string into a Uint8Array buffer.
*
* @param {BinaryString} buffer - The string to convert.
* @returns {Uint8Array} the converted data.
*/
function stringToTypedArray(buffer) {
var typedarray = new Uint8Array(buffer.length);
for (var i = 0; i < buffer.length; i++) {
typedarray[i] = buffer.charCodeAt(i);
}
return typedarray;
}
/**
* Converts a Uint8Array buffer to a binary string.
*
* @param {Uint8Array} buffer - The Uint8Array to convert.
* @returns {string} the converted string.
*/
function typedArrayToString(buffer) {
var string = "";
for (let i = 0; i < buffer.length; i += 100) {
string += String.fromCharCode.apply(
undefined,
buffer.subarray(i, i + 100)
);
}
return string;
}
/** A list of month names for Date parsing. */
const MONTH_NAMES = [
"Jan",
"Feb",
"Mar",
"Apr",
"May",
"Jun",
"Jul",
"Aug",
"Sep",
"Oct",
"Nov",
"Dec",
];
// MimeTextDecoder allows polyfill. Needed for utf-7 support.
const MimeTextDecoder = TextDecoder;
return {
decode_base64,
decode_qp,
MONTH_NAMES,
stringToTypedArray,
typedArrayToString,
MimeTextDecoder,
};
};
const mimeutils = JsMIMEmimeutils();
/**
* This file implements knowledge of how to encode or decode structured headers
* for several key headers. It is not meant to be used externally to jsmime.
*/
const JsMIMEstructuredHeaders = function () {
var structuredDecoders = new Map();
var structuredEncoders = new Map();
var preferredSpellings = new Map();
function addHeader(headerName, decoder, encoder) {
var lowerName = headerName.toLowerCase();
structuredDecoders.set(lowerName, decoder);
structuredEncoders.set(lowerName, encoder);
preferredSpellings.set(lowerName, headerName);
}
// Addressing headers: We assume that they can be specified in 1* form (this is
// false for From, but it's close enough to the truth that it shouldn't matter).
// There is no need to specialize the results for the header, so just pun it
// back to parseAddressingHeader.
function parseAddress(value) {
const headerparser = this;
return value.reduce(function (results, header) {
return results.concat(headerparser.parseAddressingHeader(header, true));
}, []);
}
function writeAddress(value) {
// Make sure the input is an array (accept a single entry)
if (!Array.isArray(value)) {
value = [value];
}
this.addAddresses(value);
}
// Addressing headers from RFC 5322:
addHeader("Bcc", parseAddress, writeAddress);
addHeader("Cc", parseAddress, writeAddress);
addHeader("From", parseAddress, writeAddress);
addHeader("Reply-To", parseAddress, writeAddress);
addHeader("Resent-Bcc", parseAddress, writeAddress);
addHeader("Resent-Cc", parseAddress, writeAddress);
addHeader("Resent-From", parseAddress, writeAddress);
addHeader("Resent-Reply-To", parseAddress, writeAddress);
addHeader("Resent-Sender", parseAddress, writeAddress);
addHeader("Resent-To", parseAddress, writeAddress);
addHeader("Sender", parseAddress, writeAddress);
addHeader("To", parseAddress, writeAddress);
// From RFC 5536:
addHeader("Approved", parseAddress, writeAddress);
// From RFC 3798:
addHeader("Disposition-Notification-To", parseAddress, writeAddress);
// Non-standard headers:
addHeader("Delivered-To", parseAddress, writeAddress);
addHeader("Return-Receipt-To", parseAddress, writeAddress);
addHeader("Mail-Reply-To", parseAddress, writeAddress);
addHeader("Mail-Followup-To", parseAddress, writeAddress);
// Parameter-based headers. Note that all parameters are slightly different, so
// we use slightly different variants here.
function parseParameterHeader(value, do2231, do2047) {
// Only use the first header for parameters; ignore subsequent redefinitions.
return this.parseParameterHeader(value[0], do2231, do2047);
}
// RFC 2045
function parseContentType(contentType) {
let params = parseParameterHeader.call(this, contentType, false, false);
const origtype = params.preSemi;
let parts = origtype.split("/");
if (parts.length != 2) {
// Malformed. Fall back to text/plain. Evil, ain't it?
params = new Map();
parts = ["text", "plain"];
}
const mediatype = parts[0].toLowerCase();
const subtype = parts[1].toLowerCase();
const type = mediatype + "/" + subtype;
const structure = new Map();
structure.mediatype = mediatype;
structure.subtype = subtype;
structure.type = type;
for (const [key, value] of params) {
structure.set(key.toLowerCase(), value);
}
return structure;
}
structuredDecoders.set("Content-Type", parseContentType);
// Unstructured headers (just decode RFC 2047 for the first header value)
function parseUnstructured(values) {
return this.decodeRFC2047Words(values[0]);
}
function writeUnstructured(value) {
this.addUnstructured(value);
}
// Message-ID headers.
function parseMessageID(values) {
// TODO: Proper parsing support for these headers is currently unsupported.
return this.decodeRFC2047Words(values[0]);
}
function writeMessageID(value) {
// TODO: Proper parsing support for these headers is currently unsupported.
this.addUnstructured(value);
}
// RFC 5322
addHeader("Comments", parseUnstructured, writeUnstructured);
addHeader("Keywords", parseUnstructured, writeUnstructured);
addHeader("Subject", parseUnstructured, writeUnstructured);
// RFC 2045
addHeader("MIME-Version", parseUnstructured, writeUnstructured);
addHeader("Content-Description", parseUnstructured, writeUnstructured);
// RFC 7231
addHeader("User-Agent", parseUnstructured, writeUnstructured);
// Date headers
function parseDate(values) {
return this.parseDateHeader(values[0]);
}
function writeDate(value) {
this.addDate(value);
}
// RFC 5322
addHeader("Date", parseDate, writeDate);
addHeader("Resent-Date", parseDate, writeDate);
// RFC 5536
addHeader("Expires", parseDate, writeDate);
addHeader("Injection-Date", parseDate, writeDate);
addHeader("NNTP-Posting-Date", parseDate, writeDate);
// RFC 5322
addHeader("Message-ID", parseMessageID, writeMessageID);
addHeader("Resent-Message-ID", parseMessageID, writeMessageID);
// Miscellaneous headers (those that don't fall under the above schemes):
// RFC 2047
structuredDecoders.set("Content-Transfer-Encoding", function (values) {
return values[0].toLowerCase();
});
structuredEncoders.set("Content-Transfer-Encoding", writeUnstructured);
/**
* Some clients like outlook.com send non-compliant References headers that
* separate values using commas. Also, some clients don't separate References
* with spaces, since these are optional according to RFC2822. So here we
* preprocess these headers (see bug 1154521 and bug 1197686).
*
* @param {string[]} values
* @returns {string} the message ids; properly space separated.
*/
function preprocessMessageIDs(values) {
return values[0].match(/<[^>]*>/g)?.join(" ");
}
structuredDecoders.set("References", preprocessMessageIDs);
structuredDecoders.set("In-Reply-To", preprocessMessageIDs);
return Object.freeze({
decoders: structuredDecoders,
encoders: structuredEncoders,
spellings: preferredSpellings,
});
};
const structuredHeaders = JsMIMEstructuredHeaders();
/**
* Implements the structured decoding of message header fields.
*/
const JsMIMEheaderparser = function () {
/**
* This is the API that we ultimately return.
*
* We define it as a global here, because we need to pass it as a |this|
* argument to a few functions.
*/
var headerparser = {};
/**
* Clean up characters that could cause display problems since they
* are not displayed.
*
* @param {string} token - The string to be cleaned.
* @returns {string} The cleaned string.
*/
function cleanToken(token) {
// Replace problematic characters so we don't get unexpected behavior
// down the line. These fall into a few categories:
// A) "Separator, space" (Zs),
// B) "Mark, Nonspacing" (Mn)
// C) "Other, Control" (Cc)
// D) "Other, Format" (Cf)
// E) "Symbol, Other"
// Unfortunately, our engine has no support for the needed regexp Unicode
// property escapes, so we need to hand-roll the character classes below (the
// regexpu tool was used to generate them).
// This should be updated regularly, to take into account new additions
// to the Unicode standard. Last updated July 2019.
// For a full list of categories, see http://unicode.org/Public/5.0.0/ucd/UCD.html.
// -- case A: /\p{Zs}/u
token = token.replace(/[\xA0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, " ");
// -- case B: /\p{Mn}/u
// This is a bit more complicated as some of them could be "real", so we'll
// only remove the ones that are known to show as blank.
token = token.replace(
/[\u034F\u17B4\u17B5\u180B-\u180D\uFE00-\uFE0F]/g,
""
);
// \uE0100-\uE01EF need to be written using their surrogate code point pairs
// until extended Unicode escapes are supported in regexps.
token = token.replace(/\uDB40[\uDD00-\uDDEF]/g, "");
// -- case C: /\p{Cc}/u, except Tab/LF/CR
// eslint-disable-next-line no-control-regex
token = token.replace(/(?![\t\n\r])[\0-\x1F\x7F-\x9F]/g, "");
// -- case D: /\p{Cf}/u
// Replace all of these with a space, except for \u0600-\u0605, which are kept.
// XXX: We replace these with spaces (" "), not empty strings ("").
// Notably, for zero width space (\u200B), replacing with an empty string
// would later drop real spaces surrounding it. Dunno why.
token = token.replace(
/(?:[\xAD\u061C\u06DD\u070F\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD80D[\uDC30-\uDC38]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F])/g,
" "
);
// -- case E: problematic symbols
// Replace U+2800 BRAILLE PATTERN BLANK with space.
token = token.replace(/\u2800/g, " ");
return token;
}
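// Illustrative behavior (a sketch based on the character classes above):
//   cleanToken("foo\u00A0bar\u200Bbaz\u0000") === "foo bar baz"
// The no-break space and the zero width space become regular spaces, and the
// control character is dropped.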
/**
* Tokenizes a message header into a stream of tokens as a generator.
*
* The low-level tokens are meant to loosely correspond to the tokens as
* defined in RFC 5322. For reasons of saner error handling, however, the two
* definitions are not exactly equivalent. The tokens we emit are the following:
* 1. Special delimiters: Any char in the delimiters string is emitted as a
* string by itself. Parsing parameter headers, for example, would use ";="
* for the delimiter string.
* 2. Quoted-strings (if opt.qstring is true): A string which is surrounded by
* double quotes. Escapes in the string are omitted when returning.
* 3. Domain Literals (if opt.dliteral is true): A string which matches the
* dliteral construct in RFC 5322. Escapes here are NOT omitted.
* 4. Comments (if opt.comments is true): Comments are handled specially. In
* practice, decoding the comments in To headers appears to be necessary, so
* comments are not stripped in the output value. Instead, they are emitted
* as if they are a special delimiter. However, all delimiters found within a
* comment are returned as if they were a quoted string, so that consumers
* ignore delimiters within comments. If ignoring comment text completely is
* desired, upon seeing a "(" token, consumers should ignore all tokens until
* a matching ")" is found (note that comments can be nested).
* 5. RFC 2047 encoded-words (if opts.rfc2047 is true): These are strings which
* are the decoded contents of RFC 2047's =?UTF-8?Q?blah?=-style words.
* 6. Atoms: Atoms are defined not in the RFC 5322 sense, but rather as the
* longest sequence of characters that is neither whitespace nor any of the
* special characters above.
*
* The intended interpretation of the stream of output tokens is that they are
* the portions of text which can be safely wrapped in whitespace with no ill
* effect. The output tokens are either strings (which represent individual
* delimiter tokens) or instances of a class that has a customized .toString()
* for output (for quoted strings, atoms, domain literals, and encoded-words).
* Checking for a delimiter MUST use the strictly equals operator (===). For
* example, the proper way to call this method is as follows:
*
* for (let token of getHeaderTokens(rest, ";=", opts)) {
* if (token === ';') {
* // This represents a literal ';' in the string
* } else if (token === '=') {
* // This represents a literal '=' in the string
* } else {
* // If a ";" qstring was parsed, we fall through to here!
* token = token.toString();
* }
* }
*
* This method does not properly tokenize 5322 in all corner cases; however,
* this is equivalent in those corner cases to an older header parsing
* algorithm, so the algorithm should be correct for all real-world cases. The
* corner cases are as follows:
* 1. Quoted-strings and domain literals are parsed even if they are within a
* comment block (we effectively treat ctext as containing qstring).
* 2. WSP need not be between a qstring and an atom (a"b" produces two tokens,
* a and b). This is an error case, though.
* 3. Legacy comments as display names: We recognize address fields with
* comments, and (a) either drop them if inside addr-spec or (b) preserve
* them as part of the display-name if not. If the display-name is empty
* while the last comment is not, we assume it's the legacy form above and
* take the comment content as the display-name.
*
* @param {string} value - The header value, post charset conversion but
* before RFC 2047 decoding, to be parsed.
* @param {string} delimiters A set of delimiters to include as individual
* tokens.
* @param {object} opts - A set of options selecting what to parse.
* @param {boolean} [opts.qstring] - If true, recognize quoted strings.
* @param {boolean} [opts.dliteral] If true, recognize domain literals.
* @param {boolean} [opts.comments] If true, recognize comments.
* @param {boolean} [opts.rfc2047] - If true, parse and decode RFC 2047
* encoded-words.
* @returns {(Token|string)[]} An array of Token objects (which have a toString
* method returning their value) or String objects (representing delimiters).
*/
/* eslint-disable complexity */
function getHeaderTokens(value, delimiters, opts) {
// The array of parsed tokens. This method used to be a generator, but it
// appears that generators are poorly optimized in current engines, so it was
// converted to not be one.
const tokenList = [];
// Represents a non-delimiter token.
function Token(token) {
// Unescape all quoted pairs. Any trailing \ is deleted.
this.token = token.replace(/\\(.?)/g, "$1");
}
Token.prototype.toString = function () {
return this.token;
};
// The start of the current token (e.g., atoms, strings)
let tokenStart = undefined;
// The set of whitespace characters, as defined by RFC 5322
const wsp = " \t\r\n";
// If we are a domain literal ([]) or a quoted string ("), this is set to the
// character to look for at the end.
let endQuote = undefined;
// The current depth of comments, since they can be nested. A value 0 means we
// are not in a comment.
let commentDepth = 0;
// Iterate over every character one character at a time.
for (let i = 0; i < value.length; i++) {
const ch = value[i];
// If we see a \, no matter what context we are in, ignore the next
// character.
if (ch == "\\") {
i++;
continue;
}
// If we are in a qstring or a dliteral, process the character only if it is
// what we are looking for to end the quote.
if (endQuote !== undefined) {
if (ch == endQuote && ch == '"') {
// Quoted strings don't include their delimiters.
let text = value.slice(tokenStart + 1, i);
// If RFC 2047 is enabled, always decode the qstring.
if (opts.rfc2047) {
text = decodeRFC2047Words(text);
}
tokenList.push(new Token(text));
endQuote = undefined;
tokenStart = undefined;
} else if (ch == endQuote && ch == "]") {
// Domain literals include their delimiters.
tokenList.push(new Token(value.slice(tokenStart, i + 1)));
endQuote = undefined;
tokenStart = undefined;
}
// Avoid any further processing.
continue;
}
// If we can match the RFC 2047 encoded-word pattern, we need to decode the
// entire word or set of words.
if (
opts.rfc2047 &&
ch == "=" &&
i + 1 < value.length &&
value[i + 1] == "?"
) {
// RFC 2047 tokens separated only by whitespace are conceptually part of
// the same output token, so we need to decode them all at once.
const encodedWordsRE = /([ \t\r\n]*=\?[^?]*\?[BbQq]\?[^?]*\?=)+/;
const result = encodedWordsRE.exec(value.slice(i));
if (result !== null) {
// If we were in the middle of a prior token (i.e., something like
// foobar=?UTF-8?Q?blah?=), yield the previous segment as a token.
if (tokenStart !== undefined) {
tokenList.push(new Token(value.slice(tokenStart, i)));
tokenStart = undefined;
}
// Find out how much we need to decode...
const encWordsLen = result[0].length;
const string = decodeRFC2047Words(
value.slice(i, i + encWordsLen),
"UTF-8"
);
// Don't make a new Token variable, since we do not want to unescape the
// decoded string.
tokenList.push({
toString() {
return string;
},
});
// Skip everything we decoded. The -1 compensates for the i++ that the
// enclosing for loop performs at the end of this iteration.
i += encWordsLen - 1;
continue;
}
// If we are here, then we failed to match the simple 2047 encoded-word
// regular expression, despite the fact that it matched the =? at the
// beginning. Fall through and treat the text as if we aren't trying to
// decode RFC 2047.
}
// If we reach this point, we're not inside of quoted strings, domain
// literals, or RFC 2047 encoded-words. This means that the characters we
// parse are potential delimiters (unless we're in comments, where
// everything starts to go really wonky). Several things could happen,
// depending on the kind of character we read and whether or not we were in
// the middle of a token. The three values here tell us what we could need
// to do at this point:
// tokenIsEnding: The current character is not able to be accumulated to an
// atom, so we need to flush the atom if there is one.
// tokenIsStarting: The current character could begin an atom (or
// anything that requires us to mark the starting point), so we need to save
// the location.
// isSpecial: The current character is a delimiter that needs to be output.
let tokenIsEnding = false,
tokenIsStarting = false,
isSpecial = false;
if (wsp.includes(ch)) {
// Whitespace ends current tokens, doesn't emit anything.
tokenIsEnding = true;
} else if (commentDepth == 0 && delimiters.includes(ch)) {
// Delimiters end the current token, and need to be output. They do not
// apply within comments.
tokenIsEnding = true;
isSpecial = true;
} else if (opts.qstring && ch == '"') {
// Quoted strings end the last token and start a new one.
tokenIsEnding = true;
tokenIsStarting = true;
endQuote = ch;
} else if (opts.dliteral && ch == "[") {
// Domain literals end the last token and start a new one.
tokenIsEnding = true;
tokenIsStarting = true;
endQuote = "]";
} else if (opts.comments && ch == "(") {
// Comments are nested (oh joy). We only really care for the outer
// delimiter, though, which also ends the prior token and needs to be
// output if the consumer requests it.
commentDepth++;
if (commentDepth == 1) {
tokenIsEnding = true;
isSpecial = true;
} else {
tokenIsStarting = true;
}
} else if (opts.comments && ch == ")") {
// Comments are nested (oh joy). We only really care for the outer
// delimiter, though, which also ends the prior token and needs to be
// output if the consumer requests it.
if (commentDepth > 0) {
commentDepth--;
}
if (commentDepth == 0) {
tokenIsEnding = true;
isSpecial = true;
} else {
tokenIsStarting = true;
}
} else {
// Not a delimiter, whitespace, comment, domain literal, or quoted string.
// Must be part of an atom then!
tokenIsStarting = true;
}
// If our analysis concluded that we closed an open token, and there is an
// open token, then yield that token.
if (tokenIsEnding && tokenStart !== undefined) {
tokenList.push(new Token(value.slice(tokenStart, i)));
tokenStart = undefined;
}
// If we need to output a delimiter, do so.
if (isSpecial) {
tokenList.push(ch);
}
// If our analysis concluded that we could open a token, and no token is
// opened yet, then start the token.
if (tokenIsStarting && tokenStart === undefined) {
tokenStart = i;
}
}
// That concludes the loop! If there is a currently open token, close that
// token now.
if (tokenStart !== undefined) {
// Error case: a partially-open quoted string is assumed to have a trailing
// " character.
if (endQuote == '"') {
tokenList.push(new Token(value.slice(tokenStart + 1)));
} else {
tokenList.push(new Token(value.slice(tokenStart)));
}
}
return tokenList;
}
/* eslint-enable complexity */
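// Illustrative tokenization (a sketch; the header fragment is hypothetical):
//   getHeaderTokens('a; b="c; d"', ";=", { qstring: true })
//     .map(t => t.toString())
//     -> ["a", ";", "b", "=", "c; d"]
// Only the bare ";" and "=" are emitted as plain (delimiter) strings; the ";"
// inside the quoted string is part of the "c; d" token.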
/**
* Convert a header value into UTF-16 strings by attempting to decode as UTF-8
* or another legacy charset. If the header is valid UTF-8, it will be decoded
* as UTF-8; if it is not, the fallbackCharset will be attempted instead.
*
* @param {string} headerValue - The header (as a binary string) to attempt
* to convert to UTF-16.
* @param {string} [fallbackCharset] The optional charset to try if UTF-8
* doesn't work.
* @returns {string} the UTF-16 representation of the string above.
*/
function convert8BitHeader(headerValue, fallbackCharset) {
// Only attempt to convert the headerValue if it contains non-ASCII
// characters.
if (/[\x80-\xff]/.exec(headerValue)) {
// First convert the value to a typed-array for mimeutils.MimeTextDecoder.
const typedarray = mimeutils.stringToTypedArray(headerValue);
// Don't try UTF-8 as fallback (redundant), and don't try UTF-16 or UTF-32
// either, since they radically change header interpretation.
// If we have a fallback charset, we want to know if decoding will fail;
// otherwise, we want to replace with substitution chars.
const hasFallback =
fallbackCharset && !fallbackCharset.toLowerCase().startsWith("utf");
const utf8Decoder = new mimeutils.MimeTextDecoder("utf-8", {
fatal: hasFallback,
});
try {
headerValue = utf8Decoder.decode(typedarray);
} catch (e) {
// Failed, try the fallback
try {
const decoder = new mimeutils.MimeTextDecoder(fallbackCharset, {
fatal: false,
});
headerValue = decoder.decode(typedarray);
} catch (ex) {}
}
}
return cleanToken(headerValue);
}
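// Illustrative conversions (a sketch; the binary-string inputs are
// hypothetical):
//   convert8BitHeader("Caf\xC3\xA9")           -> "Café"  (valid UTF-8)
//   convert8BitHeader("Caf\xE9", "ISO-8859-1") -> "Café"  (UTF-8 fails, the
//                                                 fallback charset is used)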
/**
* Decodes all RFC 2047 encoded-words in the input string. The string does not
* necessarily have to contain any such words. This is useful, for example, for
* parsing unstructured headers.
*
* @param {string} headerValue The header which may contain RFC 2047 encoded-
* words.
* @returns {string} a full UTF-16 string with all encoded words expanded.
*/
function decodeRFC2047Words(headerValue) {
// Unfortunately, many implementations of RFC 2047 encoding are actually wrong
// in that they split over-long encoded words without regard for whether or
// not the split point is in the middle of a multibyte character. Therefore,
// we need to be able to handle these situations gracefully. This is done by
// using the decoder in streaming mode so long as the next token is another
// 2047 token with the same charset.
let lastCharset = "",
currentDecoder = undefined;
/**
* Decode a single RFC 2047 token. This function is inline so that we can
* easily close over the lastCharset/currentDecoder variables, needed for
* handling bad RFC 2047 productions properly.
* E.g. =?iso-8859-1?q?this=20is=20some=20text?=
*/
function decode2047Token(token, isLastToken) {
const tokenParts = token.split("?");
// If it's obviously not a valid token, return false immediately.
if (tokenParts.length != 5 || tokenParts[4] != "=") {
return false;
}
// The charset parameter is defined in RFC 2231 to be charset or
// charset*language. We only care about the charset here, so ignore any
// language parameter that gets passed in.
const charset = tokenParts[1].split("*", 1)[0];
const encoding = tokenParts[2],
text = tokenParts[3];
let buffer;
if (encoding == "B" || encoding == "b") {
// Decode base64. If there's any non-base64 data, treat the string as
// an illegal token.
if (/[^ A-Za-z0-9+\/=]/.exec(text)) {
return false;
}
// Decode the string
buffer = mimeutils.decode_base64(text, false)[0];
} else if (encoding == "Q" || encoding == "q") {
// Q encoding here looks a lot like quoted-printable text. The differences
// between quoted-printable and this are that quoted-printable allows you
// to quote newlines (this doesn't), while this replaces spaces with _.
// We can reuse the decode_qp code here, since newlines are already
// stripped from the header. There is one edge case that could trigger a
// false positive, namely when you have a single = or an = followed by
// whitespace at the end of the string. Such an input string is already
// malformed to begin with, so stripping the = and following input in that
// case should not be an important loss.
buffer = mimeutils.decode_qp(text.replace(/_/g, " "))[0];
} else {
return false;
}
// Make the buffer be a typed array for what follows
const stringBuffer = buffer;
buffer = mimeutils.stringToTypedArray(buffer);
// If we cannot reuse the last decoder, flush out whatever remains.
var output = "";
if (charset != lastCharset && currentDecoder) {
output += currentDecoder.decode();
currentDecoder = null;
}
// Initialize the decoder for this token.
lastCharset = charset;
if (!currentDecoder) {
try {
currentDecoder = new mimeutils.MimeTextDecoder(charset, {
fatal: false,
});
} catch (e) {
// We don't recognize the charset, so give up.
return false;
}
}
// Convert this token with the buffer. Note the stream parameter--although
// RFC 2047 tokens aren't supposed to break in the middle of a multibyte
// character, a lot of software messes up and does so because it's hard not
// to (see headeremitter.js for exactly how hard!).
// We must not stream ISO-2022-JP if the buffer switches back to
// the ASCII state, that is, ends in "ESC(B".
// Also, we shouldn't do streaming on the last token.
let doStreaming;
if (
isLastToken ||
(charset.toUpperCase() == "ISO-2022-JP" &&
stringBuffer.endsWith("\x1B(B"))
) {
doStreaming = { stream: false };
} else {
doStreaming = { stream: true };
}
return output + currentDecoder.decode(buffer, doStreaming);
}
// The first step of decoding is to split the string into RFC 2047 and
// non-RFC 2047 tokens. RFC 2047 tokens look like the following:
// =?charset?c?text?=, where c is one of B, b, Q, and q. The split regex does
// some amount of semantic checking, so that malformed RFC 2047 tokens will
// get ignored earlier.
const components = headerValue.split(/(=\?[^?]*\?[BQbq]\?[^?]*\?=)/);
// Find last RFC 2047 token.
let lastRFC2047Index = -1;
for (let i = 0; i < components.length; i++) {
if (components[i].substring(0, 2) == "=?") {
lastRFC2047Index = i;
}
}
for (let i = 0; i < components.length; i++) {
if (components[i].substring(0, 2) == "=?") {
const decoded = decode2047Token(components[i], i == lastRFC2047Index);
if (decoded !== false) {
// If 2047 decoding succeeded for this bit, rewrite the original value
// with the proper decoding.
components[i] = decoded;
// We're done processing this component, so continue to the next one.
continue;
}
} else if (/^[ \t\r\n]*$/.exec(components[i])) {
// Whitespace-only tokens get squashed into nothing, so 2047 tokens will
// be concatenated together.
components[i] = "";
continue;
}
// If there was stuff left over from decoding the last 2047 token, flush it
// out.
lastCharset = "";
if (currentDecoder) {
components[i] = currentDecoder.decode() + components[i];
currentDecoder = null;
}
}
// After the for loop, we'll have a set of decoded strings. Concatenate them
// together to make the return value.
return cleanToken(components.join(""));
}
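// Illustrative decodings (a sketch; the encoded words are hypothetical):
//   decodeRFC2047Words("=?ISO-8859-1?Q?Andr=E9?= Pirard")
//     -> "André Pirard"
//   decodeRFC2047Words("=?UTF-8?Q?Caf=C3=A9?= =?UTF-8?Q?_au_lait?=")
//     -> "Café au lait"   // whitespace between encoded words is dropped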
// Structured field decoders
// -------------------------
/**
* Extract a list of addresses from a header which matches the RFC 5322
* address-list production, possibly doing RFC 2047 decoding along the way.
*
* The output of this method is an array of elements corresponding to the
* addresses and the groups in the input header. An address is represented by
* an object of the form:
* {
* name: The display name of the address
* email: The address of the object
* }
* while a group is represented by an object of the form:
* {
* name: The display name of the group
* group: An array of address object for members in the group.
* }
*
* @param {string} header - The MIME header text to be parsed
* @param {boolean} doRFC2047 If true, decode RFC 2047 parameters found in the
* header.
* @returns {(Address|Group)[]} An array of the addresses found in the header,
* where each element is of the form mentioned above.
*/
function parseAddressingHeader(header, doRFC2047) {
// Default to true
if (doRFC2047 === undefined) {
doRFC2047 = true;
}
// The final (top-level) results list to append to.
let results = [];
// Temporary results
let addrlist = [];
// Build up all of the values
let displayName = "",
groupName = "",
localPart = "",
address = "",
comment = "";
// Indicators of current state
let inAngle = false,
inComment = false,
needsSpace = false,
afterAddress = false;
let preserveSpace = false;
let commentClosed = false;
// RFC 5322 §3.4 notes that legacy implementations exist which use a simple
// recipient form where the addr-spec appears without the angle brackets,
// but includes the name of the recipient in parentheses as a comment
// following the addr-spec. While we do not create this format, we still
// want to recognize it, though.
// Furthermore, despite allowing comments in addresses, RFC 5322 §3.4 notes
// that legacy implementations may interpret the comment, and thus it
// recommends not to use them. (Also, they may be illegal as per RFC 5321.)
// While we do not create address fields with comments, we recognize such
// comments during parsing and (a) either drop them if inside addr-spec or
// (b) preserve them as part of the display-name if not.
// If the display-name is empty while the last comment is not, we assume it's
// the legacy form above and take the comment content as the display-name.
//
// When parsing the address field, we at first do not know whether any
// strings belong to the display-name (which may include comments) or to the
// local-part of an addr-spec (where we ignore comments) until we find an
// '@' or an '<' token. Thus, we collect both variants until the fog lifts,
// plus the last comment seen.
let lastComment = "";
/**
* Add the parsed mailbox object to the address list.
* If it's in the legacy form above, correct the display-name.
* Also reset any faked flags.
*
* @param {string} addrName - display-name as per RFC 5322
* @param {string} addrEmail - addr-spec as per RFC 5322
*/
function addToAddrList(addrName, addrEmail) {
// Keep the local-part quoted if it needs to be.
const lp = addrEmail.substring(0, addrEmail.lastIndexOf("@"));
if (/[ !()<>\[\]:;@\\,"]/.exec(lp) !== null) {
addrEmail =
'"' +
lp.replace(/([\\"])/g, "\\$1") +
'"' +
addrEmail.substring(addrEmail.lastIndexOf("@"));
}
// Replace all whitespace characters with a single whitespace,
// to avoid consecutive whitespace and also to normalize tabs and newlines.
addrName = addrName.replace(/\s+/g, " ").trim();
if (addrName === "" && lastComment !== "") {
// Take last comment content as the display-name.
const offset = lastComment[0] === " " ? 2 : 1;
addrName = lastComment.substr(offset, lastComment.length - offset - 1);
}
if (addrName !== "" || addrEmail !== "") {
addrlist.push({ name: addrName, email: addrEmail });
}
// Clear pending flags and variables.
displayName = localPart = address = lastComment = "";
inAngle = inComment = needsSpace = afterAddress = false;
}
// Main parsing loop
for (let token of getHeaderTokens(header, ":,;<>@", {
qstring: true,
comments: true,
dliteral: true,
rfc2047: doRFC2047,
})) {
if (token === ":") {
groupName = displayName;
displayName = "";
localPart = "";
// If we had prior email address results, commit them to the top-level.
if (addrlist.length > 0) {
results = results.concat(addrlist);
}
addrlist = [];
} else if (token === "<" && !afterAddress) {
if (inAngle) {
// Interpret the address we were parsing as a name.
if (address.length > 0) {
displayName = address;
}
localPart = address = "";
} else {
inAngle = true;
}
} else if (token === ">" && !afterAddress) {
inAngle = false;
// Forget addr-spec comments.
lastComment = "";
afterAddress = true;
} else if (token === "(") {
inComment = true;
// The needsSpace flag may not always be set even if it should be,
// e.g. for a comment behind an angle-addr.
// Also, we need to restore the needsSpace flag if we ignore the comment.
preserveSpace = needsSpace;
if (!needsSpace) {
needsSpace = displayName !== "" && displayName.substr(-1) !== " ";
}
comment = needsSpace ? " (" : "(";
commentClosed = false;
} else if (token === ")") {
inComment = false;
comment += ")";
lastComment = comment;
// The comment may be part of the name, but not of the local-part.
// Enforce a space behind the comment only when not ignoring it.
if (inAngle) {
needsSpace = preserveSpace;
} else {
displayName += comment;
needsSpace = true;
}
commentClosed = true;
continue;
} else if (token === "@") {
if (afterAddress) {
continue;
}
// An @ means we see an email address. If we're not within <> brackets,
// then we just parsed an email address instead of a display name. Empty
// out the display name for the current production.
if (!inAngle) {
address = localPart;
displayName = "";
localPart = "";
// The remainder of this mailbox is part of an addr-spec.
inAngle = true;
}
address += "@";
} else if (token === ",") {
// A comma ends the current name. If we have something that's kind of a
// name, add it to the result list. If we don't, then our input looks like
// To: , , -> don't bother adding an empty entry.
addToAddrList(displayName, address);
afterAddress = false;
} else if (token === ";") {
// Add pending name to the list
addToAddrList(displayName, address);
// If no group name was found, treat the ';' as a ','. In any case, we
// need to copy the results of addrlist into either a new group object or
// the main list.
if (groupName === "") {
results = results.concat(addrlist);
} else {
results.push({
name: groupName,
group: addrlist,
});
}
// ... and reset every other variable.
addrlist = [];
groupName = "";
} else {
// This is either comment content, a quoted-string, or some span of
// dots and atoms.
token = cleanToken(token.toString());
// Ignore the needs space if we're a "close" delimiter token.
let spacedToken = token;
if (needsSpace && token && token[0] != ".") {
spacedToken = " " + spacedToken;
}
// Which field do we add this data to?
if (inComment) {
comment += spacedToken;
} else if (inAngle) {
address += spacedToken;
} else {
if (!afterAddress) {
displayName += spacedToken;
}
// Never add a space to the local-part, if we just ignored a comment.
if (commentClosed) {
localPart += token;
commentClosed = false;
} else {
localPart += spacedToken;
}
}
// We need space for the next token if we aren't some kind of comment or
// . delimiter.
needsSpace = token && token[0] != ".";
// The fall-through case after this resets needsSpace to false, and we
// don't want that!
continue;
}
// If we just parsed a delimiter, we don't need any space for the next
// token.
needsSpace = false;
}
// If we're missing the final ';' of a group, assume it was present. Also, add
// in the details of any email/address that we previously saw.
addToAddrList(displayName, address);
if (groupName !== "") {
results.push({ name: groupName, group: addrlist });
addrlist = [];
}
// Add the current address list build-up to the list of addresses, and return
// the whole array to the caller.
return results.concat(addrlist);
}
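// Illustrative results (a sketch; the addresses are hypothetical):
//   parseAddressingHeader("Fred Bloggs <fred@example.com>, wilma@example.com")
//     -> [{ name: "Fred Bloggs", email: "fred@example.com" },
//         { name: "", email: "wilma@example.com" }]
//   parseAddressingHeader("undisclosed-recipients:;")
//     -> [{ name: "undisclosed-recipients", group: [] }]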
/**
* Extract parameters from a header which is a series of ;-separated
* attribute=value tokens.
*
* @param {string} headerValue The MIME header value to parse.
* @param {boolean} doRFC2047 - If true, decode RFC 2047 encoded-words.
* @param {boolean} doRFC2231 - If true, decode RFC 2231 encoded parameters.
* @returns {Map<string,string>} A map of parameter names to parameter values.
* The property preSemi is set to the token that precedes the first semicolon.
*/
/* eslint-disable complexity */
function parseParameterHeader(headerValue, doRFC2047, doRFC2231) {
// The basic syntax of headerValue is token [; token = token-or-qstring]*
// Copying more or less liberally from nsMIMEHeaderParamImpl:
// The first token is the text to the first whitespace or semicolon.
var semi = headerValue.indexOf(";");
let start, rest;
if (semi < 0) {
start = headerValue;
rest = "";
} else {
start = headerValue.substring(0, semi);
rest = headerValue.substring(semi); // Include the semicolon
}
// Strip start down to its first whitespace-delimited token (<WSP><nowsp><WSP>).
start = start.trim().split(/[ \t\r\n]/)[0];
// Decode the parameter tokens.
const opts = { qstring: true, rfc2047: doRFC2047 };
// tokenName is the name of the parameter,
let tokenName = "";
// inName is true iff we don't have a name yet.
let inName = true;
// Matches is a list of [name, value] pairs, where we found something that
// looks like name=value in the input string.
const matches = [];
for (let token of getHeaderTokens(rest, ";=", opts)) {
if (token === ";") {
// If we found a name but never saw a value (we have ... tokenA=; tokenB),
// push the name with an empty value instead.
if (tokenName != "" && !inName) {
matches.push([tokenName, ""]);
}
tokenName = "";
inName = true;
} else if (token === "=") {
inName = false;
} else if (inName && tokenName == "") {
tokenName = token.toString();
} else if (!inName && tokenName != "") {
token = token.toString();
// RFC 2231 doesn't make it clear if %-encoding is supposed to happen
// within a quoted string, but this is very much required in practice. If
// it ends with a '*', then the string is an extended-value, which means
// that its value may be %-encoded.
if (doRFC2231 && tokenName.endsWith("*")) {
token = token.replace(
/%([0-9A-Fa-f]{2})/g,
function (match, hexchars) {
return String.fromCharCode(parseInt(hexchars, 16));
}
);
}
matches.push([tokenName, token]);
// Clear the name, so we ignore anything afterwards.
tokenName = "";
} else if (inName) {
// We have ...; tokenA tokenB ... -> ignore both tokens
tokenName = ""; // Error recovery, ignore this one
}
}
// If we have a leftover "...; tokenA=" at the end, push the name with an
// empty value.
if (tokenName != "" && !inName) {
matches.push([tokenName, ""]);
}
// Now matches holds the parameters, so clean up for RFC 2231. There are three
// cases: param=val, param*=us-ascii'en-US'blah, and param*n= variants. The
// order of preference is to pick the middle, then the last, then the first.
// Note that we already unpacked %-encoded values.
// simpleValues is just a straight parameter -> value map.
// charsetValues is the parameter -> value map, although values are stored
// before charset decoding happens.
// continuationValues maps parameter -> array of values, with extra properties
// valid (if we decided we couldn't do anything anymore) and hasCharset (which
// records if we need to decode the charset parameter or not).
var simpleValues = new Map();
var charsetValues = new Map();
var continuationValues = new Map();
for (let [paramName, value] of matches) {
// Get first index, not last index, so we match param*0*= like param*0=.
const star = paramName.indexOf("*");
if (star == -1) {
// This is the case of param=val. Select the first value here, if there
// are multiple ones.
if (!simpleValues.has(paramName)) {
simpleValues.set(paramName, value);
}
} else if (star == paramName.length - 1) {
// This is the case of param*=us-ascii'en-US'blah.
paramName = paramName.substring(0, star);
// Again, select only the first value here.
if (!charsetValues.has(paramName)) {
charsetValues.set(paramName, value);
}
} else {
// This is the case of param*0= or param*0*=.
const param = paramName.substring(0, star);
let entry = continuationValues.get(param);
// Did we previously find this one to be bungled? Then ignore it.
if (continuationValues.has(param) && !entry.valid) {
continue;
}
// If we haven't seen it yet, set up the entry now. Note that entries are not
// straight string values but arrays of continuation values carrying extra
// .valid and .hasCharset properties.
if (!continuationValues.has(param)) {
entry = [];
entry.valid = true;
entry.hasCharset = undefined;
continuationValues.set(param, entry);
}
// When the parameter name ends in *, we will need to do charset decoding.
// Note that the star is only meaningful for the *0*= case.
const lastStar = paramName[paramName.length - 1] == "*";
let number = paramName.substring(
star + 1,
paramName.length - (lastStar ? 1 : 0)
);
if (number == "0") {
entry.hasCharset = lastStar;
} else if (
number.length == 0 ||
(number[0] == "0" && number != "0") ||
!/^[0-9]+$/.test(number)
) {
// Is the continuation number illegal?
entry.valid = false;
continue;
}
// Normalize to an integer
number = parseInt(number, 10);
// Is this a repeat? If so, bail.
if (entry[number] !== undefined) {
entry.valid = false;
continue;
}
// Set the value for this continuation index. JS's magic array setter will
// expand the array if necessary.
entry[number] = value;
}
}
// Build the actual parameter array from the parsed values
// Simple values have lowest priority, so just add everything into the result
// now.
var values = new Map(simpleValues);
if (doRFC2231) {
// Continuation values come next
for (const [paramName, entry] of continuationValues) {
// If we never saw a param*0= or param*0*= value, then we can't do any
// reasoning about what it looks like, so bail out now.
if (entry.hasCharset === undefined) {
continue;
}
// Use as many entries in the array as are valid--if we are missing an
// entry, stop there.
let valid = true;
for (var i = 0; valid && i < entry.length; i++) {
if (entry[i] === undefined) {
valid = false;
}
}
// Concatenate as many parameters as are valid. If we need to decode the
// charset, do so now.
let value = entry.slice(0, i).join("");
if (entry.hasCharset) {
try {
value = decode2231Value(value);
} catch (e) {
// Bad charset, don't add anything.
continue;
}
}
// Finally, add this to the output array.
values.set(paramName, value);
}
// Highest priority is the charset conversion.
for (const pair of charsetValues) {
try {
values.set(pair[0], decode2231Value(pair[1]));
} catch (e) {
// Bad charset, don't add anything.
}
}
}
for (const [key, value] of values.entries()) {
values.set(key, cleanToken(value));
}
// Finally, return the values computed above.
values.preSemi = start;
return values;
}
/* eslint-enable complexity */
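// Illustrative results (a sketch; the filenames are hypothetical):
//   parseParameterHeader('attachment; filename="foo bar.txt"', true, true)
//     -> Map { "filename" => "foo bar.txt" }, with .preSemi === "attachment"
//   parseParameterHeader("attachment; filename*=UTF-8''na%C3%AFve.txt", true, true)
//     -> Map { "filename" => "naïve.txt" }, with .preSemi === "attachment"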
/**
* Convert a RFC 2231-encoded string parameter into a Unicode version of the
* string. This assumes that percent-decoding has already been applied.
*
* @param {string} value The RFC 2231-encoded string to decode.
* @returns {string} the Unicode version of the string.
*/
function decode2231Value(value) {
const quote1 = value.indexOf("'");
const quote2 = quote1 >= 0 ? value.indexOf("'", quote1 + 1) : -1;
const charset = quote1 >= 0 ? value.substring(0, quote1) : "";
// It turns out that the language isn't useful anywhere in our codebase for
// the present time, so we will safely ignore it.
// var language = (quote2 >= 0 ? value.substring(quote1 + 2, quote2) : "");
value = value.substring(Math.max(quote1, quote2) + 1);
// Convert the value into a typed array for decoding
const typedarray = mimeutils.stringToTypedArray(value);
// Decode the charset. If the charset isn't found, we throw an error. Try to
// fallback in that case.
return new mimeutils.MimeTextDecoder(charset, { fatal: true }).decode(
typedarray,
{
stream: false,
}
);
}
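// For example (input already %-decoded, as noted above):
//   decode2231Value("UTF-8'en-US'Caf\xC3\xA9") -> "Café"
// The language tag ("en-US") is parsed but ignored.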
// This is a map of known timezone abbreviations, for fallback in obsolete Date
// productions.
const KNOWN_TIMEZONES = {
// The following timezones are explicitly listed in RFC 5322.
UT: "+0000",
GMT: "+0000",
EST: "-0500",
EDT: "-0400",
CST: "-0600",
CDT: "-0500",
MST: "-0700",
MDT: "-0600",
PST: "-0800",
PDT: "-0700",
// The following are time zones copied from NSPR's prtime.c
AST: "-0400", // Atlantic Standard Time
NST: "-0330", // Newfoundland Standard Time
BST: "+0100", // British Summer Time
MET: "+0100", // Middle Europe Time
EET: "+0200", // Eastern Europe Time
JST: "+0900", // Japan Standard Time
};
/**
* Parse a header that contains a date-time definition according to RFC 5322.
* The result is a JS date object with the same timestamp as the header.
*
* The dates returned by this parser cannot be reliably converted back into the
* original header for two reasons. First, JS date objects cannot retain the
* timezone information they were initialized with, so reserializing a date
* header would necessarily produce a date in either the current timezone or in
* UTC. Second, JS dates measure time as seconds elapsed from the POSIX epoch
* excluding leap seconds. Any timestamp containing a leap second is instead
* converted into one that represents the next second.
*
* Dates that do not match the RFC 5322 production may instead be parsed
* using the Date.parse function. The strings that are accepted by
* Date.parse are not fully defined by the standard, but most implementations
* should accept strings that look rather close to RFC 5322 strings. Truly
* invalid dates produce a formulation that results in an invalid date,
* detectable by having its .getTime() method return NaN.
*
* @param {string} header The MIME header value to parse.
* @returns {Date} the date contained within the header, as described
* above.
*/
function parseDateHeader(header) {
let tokens = getHeaderTokens(header, ",:", {}).map(x => x.toString());
// What does a Date header look like? In practice, most date headers devolve
// into Date: [dow ,] dom mon year hh:mm:ss tzoff [(abbrev)], with the day of
// week mostly present and the timezone abbreviation mostly absent.
// First, ignore the day-of-the-week if present. This would be the first two
// tokens.
if (tokens.length > 1 && tokens[1] === ",") {
tokens = tokens.slice(2);
}
// If there are too few tokens, the date is obviously invalid.
if (tokens.length < 8) {
return new Date(NaN);
}
// Save off the numeric tokens
const day = parseInt(tokens[0]);
// month is tokens[1]
let year = parseInt(tokens[2]);
const hours = parseInt(tokens[3]);
// tokens[4] === ':'
const minutes = parseInt(tokens[5]);
// tokens[6] === ':'
const seconds = parseInt(tokens[7]);
// Compute the month. Check only the first three letters for equality; this
// allows us to accept, e.g., "January" in lieu of "Jan."
let month = mimeutils.MONTH_NAMES.indexOf(tokens[1].slice(0, 3));
// If the month name is not recognized, make the result illegal.
if (month < 0) {
month = NaN;
}
// Compute the full year if it's only 2 digits. RFC 5322 states that the
// cutoff is 50 instead of 70.
if (year < 100) {
year += year < 50 ? 2000 : 1900;
}
// Compute the timezone offset. If it's not in the form ±hhmm, convert it to
// that form.
let tzoffset = tokens[8];
if (tzoffset in KNOWN_TIMEZONES) {
tzoffset = KNOWN_TIMEZONES[tzoffset];
}
let decompose = /^([+-])(\d\d)(\d\d)$/.exec(tzoffset);
// Unknown? Make it +0000
if (decompose === null) {
decompose = ["+0000", "+", "00", "00"];
}
let tzOffsetInMin = parseInt(decompose[2]) * 60 + parseInt(decompose[3]);
if (decompose[1] == "-") {
tzOffsetInMin = -tzOffsetInMin;
}
// How do we make the date at this point? Well, the JS date's constructor
// builds the time in terms of the local timezone. To account for the offset
// properly, we need to build in UTC.
const finalDate = new Date(
Date.UTC(year, month, day, hours, minutes, seconds) -
tzOffsetInMin * 60 * 1000
);
// Suppose our header was mangled and we couldn't read it--some of the fields
// became undefined. In that case, the date would become invalid, and the
// indication that it is so is that the underlying number is a NaN. In that
// scenario, we could attempt to use JS Date parsing as a last-ditch
// effort. But it's not clear that such messages really exist in practice,
// and the valid formats for Date in ES6 are unspecified.
return finalDate;
}
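// Illustrative parses (a sketch; the dates are hypothetical):
//   parseDateHeader("Tue, 17 Sep 2019 11:31:34 -0700").toISOString()
//     -> "2019-09-17T18:31:34.000Z"
//   parseDateHeader("not a date").getTime()
//     -> NaN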
// Structured header decoding support
// ----------------------------------
// Load the default structured decoders
var structuredDecoders = new Map();
var preferredSpellings = structuredHeaders.spellings;
var forbiddenHeaders = new Set();
for (const pair of structuredHeaders.decoders) {
addStructuredDecoder(pair[0], pair[1]);
forbiddenHeaders.add(pair[0].toLowerCase());
}
/**
* Use an already-registered structured decoder to parse the value of the header
* into a structured representation.
*
* As this method is designed to be used for the internal MIME Parser to convert
* the raw header values to well-structured values, value is intended to be an
* array consisting of all occurrences of the header in order. However, for ease
* of use by other callers, it can also be treated as a string.
*
* If the decoder for the header is not found, an exception will be thrown.
*
* A large set of headers have pre-defined structured decoders; these decoders
* cannot be overridden with addStructuredDecoder, as doing so could prevent the
* MIME or message parsers from working properly. The pre-defined structured
* headers break down into five classes of results, plus some ad-hoc
* representations. They are:
*
* Addressing headers (results are the same as parseAddressingHeader):
* - Approved
* - Bcc
* - Cc
* - Delivered-To
* - Disposition-Notification-To
* - From
* - Mail-Reply-To
* - Mail-Followup-To
* - Reply-To
* - Resent-Bcc
* - Resent-Cc
* - Resent-From
* - Resent-Reply-To
* - Resent-Sender
* - Resent-To
* - Return-Receipt-To
* - Sender
* - To
*
* Date headers (results are the same as parseDateHeader):
* - Date
* - Expires
* - Injection-Date
* - NNTP-Posting-Date
* - Resent-Date
*
* References headers (results are the same as parseReferencesHeader):
* - (TODO: Parsing support for these headers is currently unsupported)
*
* Message-ID headers (results are the first entry of the result of
* parseReferencesHeader):
* - (TODO: Parsing support for these headers is currently unsupported)
*
* Unstructured headers (results are merely decoded according to RFC 2047):
* - Comments
* - Content-Description
* - Keywords
* - Subject
*
* The ad-hoc headers and their resulting formats are as follows:
* Content-Type: returns a JS Map of parameter names (in lower case) to their
* values, along with the following extra properties defined on the map:
* - mediatype: the type to the left of '/' (e.g., 'text', 'message')
* - subtype: the type to the right of '/' (e.g., 'plain', 'rfc822')
* - type: the full typename (e.g., 'text/plain')
* RFC 2047 and RFC 2231 decoding is applied where appropriate. The values of
* the type, mediatype, and subtype attributes are all normalized to lower-case,
* as are the names of all parameters.
*
* Content-Transfer-Encoding: the first value is converted to lower-case.
*
* @param {string} header - The name of the header of the values.
* @param {string|string[]} value The value(s) of the headers, after charset
* conversion (if any) has been applied. If it is an array, the headers are
* listed in the order they appear in the message.
* @returns {object} a structured representation of the header values.
*/
function parseStructuredHeader(header, value) {
// Enforce that the parameter is an array. If it's a string, make it a
// 1-element array.
if (typeof value === "string" || value instanceof String) {
value = [value];
}
if (!Array.isArray(value)) {
throw new TypeError("Header value is not an array: " + value);
}
// Lookup the header in our decoders; if present, use that to decode the
// header.
const lowerHeader = header.toLowerCase();
if (structuredDecoders.has(lowerHeader)) {
return structuredDecoders.get(lowerHeader).call(headerparser, value);
}
// If not present, throw an exception.
throw new Error("Unknown structured header: " + header);
}
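// Illustrative calls (a sketch; the header values are hypothetical):
//   parseStructuredHeader("Content-Transfer-Encoding", ["Base64"]) -> "base64"
//   parseStructuredHeader("Subject", "=?UTF-8?Q?Caf=C3=A9?=")      -> "Café"
//   parseStructuredHeader("X-Unknown", ["foo"])  // throws Error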
/**
* Add a custom structured MIME decoder to the set of known decoders. These
* decoders are used for {@link parseStructuredHeader} and similar functions to
* encode richer, more structured values instead of relying on string
* representations everywhere.
*
* Structured decoders are functions which take in a single parameter consisting
* of an array of the string values of the header, in order that they appear in
* the message. These headers have had the charset conversion (if necessary)
* applied to them already. The this parameter of the function is set to be the
* headerparser module.
*
* There is a large set of structured decoders built-in to the jsmime library
* already. As these headers are fundamental to the workings of jsmime,
* attempting to replace them with a custom version will instead produce an
* exception.
*
* @param {string} header - The header name (in any case) for which the
* decoder will be used.
* @param {function(string[]):object} decoder - The structured decoder
* function.
*/
function addStructuredDecoder(header, decoder) {
const lowerHeader = header.toLowerCase();
if (forbiddenHeaders.has(lowerHeader)) {
throw new Error("Cannot override header: " + header);
}
structuredDecoders.set(lowerHeader, decoder);
if (!preferredSpellings.has(lowerHeader)) {
preferredSpellings.set(lowerHeader, header);
}
}
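// Illustrative registration (a sketch; the X-Priority decoder below is a
// hypothetical example, not part of the built-in set):
//   addStructuredDecoder("X-Priority", vs => parseInt(vs[0], 10));
//   parseStructuredHeader("X-Priority", ["1 (Highest)"]) -> 1
//   addStructuredDecoder("To", vs => vs)  // throws: "To" cannot be overridden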
headerparser.addStructuredDecoder = addStructuredDecoder;
headerparser.convert8BitHeader = convert8BitHeader;
headerparser.decodeRFC2047Words = decodeRFC2047Words;
headerparser.getHeaderTokens = getHeaderTokens;
headerparser.parseAddressingHeader = parseAddressingHeader;
headerparser.parseDateHeader = parseDateHeader;
headerparser.parseParameterHeader = parseParameterHeader;
headerparser.parseStructuredHeader = parseStructuredHeader;
return Object.freeze(headerparser);
};
const headerparser = JsMIMEheaderparser();
// JavaScript Raw MIME Parser
// --------------------------
/**
* The parser implemented in this file produces a MIME part tree for a given
* input message via a streaming callback interface. It does not, by itself,
* understand concepts like attachments (hence the term 'Raw'); the consumer
* must translate output into such a format.
*
* Charsets:
* The MIME specifications permit a single message to contain multiple charsets
* (or perhaps none) as raw octets. As JavaScript strings are implicitly
* implemented in UTF-16, it is possible that some engines will attempt to
* convert these strings using an incorrect charset or simply fail to convert
* them at all. This parser assumes that its input is in the form of a "binary
* string", a string that uses only the first 256 characters of Unicode to
* represent the individual octets. To verify that charsets are not getting
* mangled elsewhere in the pipeline, the auxiliary test file test/data/charsets
* can be used.
*
* This parser attempts to hide the charset details from clients as much as
* possible. The resulting values of structured headers are always converted
* into proper Unicode strings before being exposed to clients; getting at the
* raw binary string data can only be done via getRawHeader. The .charset
* parameter on header objects, if changed, changes the fallback charset used
* for headers. It is initialized to the presumed charset of the corresponding
* part, taking into account the charset and force-charset options of the
* parser. Body parts are only converted into Unicode strings if the strformat
* option is set to Unicode. Even then, only the bodies of parts with a media
* type of text are converted to Unicode strings using available charset data;
* other parts are retained as Uint8Array objects.
*
* Part numbering:
* Since the output is a streaming format, individual parts are identified by a
* numbering scheme. The intent of the numbering scheme for parts is to comply
* with the part numbers as dictated by RFC 3501 as much as possible; however,
* that scheme does have several edge cases which would, if strictly followed,
* make it impossible to refer to certain parts of the message. In addition, we
* wish to make it possible to refer to parts which are not discoverable in the
* original MIME tree but are still viewable as parts. The part numbering
* scheme is as follows:
* - Individual sections of a multipart/* body are numbered in increasing order
* sequentially, starting from 1. Note that the prologue and the epilogue of
* a multipart/* body are not considered entities and are therefore not
* included in the part numbering scheme (there is no way to refer to them).
* - The numbers of multipart/* parts are separated by `.' characters.
* - The outermost message is referred to by use of the empty string.
* --> The following segments are not accounted for by IMAP part numbering. <--
* - The body of any message/rfc822 or similar part is distinguished from the
* message part as a whole by appending a `$' character. This does not apply
* to the outermost message/rfc822 envelope.
*/
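/*
 * Worked example of the numbering scheme above, for a hypothetical message
 * whose top level is multipart/mixed containing a text part and an attached
 * message/rfc822 that itself carries a multipart/alternative body:
 *   ""     - the outermost message
 *   "1"    - the text/plain part
 *   "2"    - the attached message/rfc822 part as a whole
 *   "2$"   - the body of the attached message
 *   "2$.1" - the first alternative inside the attached message's body
 *   "2$.2" - the second alternative inside the attached message's body
 */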
const JsMIMEmimeparser = function () {
var spellings = structuredHeaders.spellings;
/**
* An object that represents the structured MIME headers for a message.
*
* This class is primarily used as the 'headers' parameter in the startPart
* callback on handlers for MimeParser. As such, it is designed to do the right
* thing in common cases as much as possible, with some advanced customization
* possible for clients that need such flexibility.
*
* In a nutshell, this class stores the raw headers as an internal Map. The
* structured headers are not computed until they are actually used, which means
* that potentially expensive structuring (e.g., doing manual DKIM validation)
* can be performed as a structured decoder without impeding performance for
* those who just want a few common headers.
*
* The outer API of this class is intended to be similar to a read-only Map
* object (complete with iterability support), with a few extra properties to
* represent things that are hard to determine properly from headers. The keys
* used are "preferred spellings" of the headers, although the get and has
* methods will accept header parameters of any case. Preferred spellings are
* derived from the name passed to addStructuredDecoder/addStructuredEncoder; if
* no structured decoder has been registered, then the preferred spelling is
* formed by capitalizing the first letter of every word in the header name.
*
* Extra properties compared to a Map object are:
* - charset: This field represents the assumed charset of the associated MIME
* body. It is prefilled using a combination of the charset and force-charset
* options on the associated MimeParser instance as well as attempting to find
* a charset parameter in the Content-Type header.
*
* If the force-charset option is false, the charset is guessed first using
* the Content-Type header's charset parameter, falling back to the charset
* option if it is present. If the force-charset option is true, the charset
* is initially set to the charset option. This initial guessed value can be
* overridden at any time by simply setting the field on this object.
*
* The charset is better reflected as a parameter of the body rather than the
* headers; this is ultimately the charset parameter that will be used if a
* body part is being converted to a Unicode strformat. Headers are converted
* using headerparser.convert8BitHeader, and this field is used as the
* fallbackCharset parameter, which will always attempt to decode as UTF-8
* first (in accordance with RFC 6532) and will refuse to decode as UTF-16 or
* UTF-32, as ASCII is not a subset of those charsets.
*
* - rawHeaderText: This read-only field contains the original header text from
* which headers were parsed, preserving case and whitespace (including
* alternate line endings instead of CRLF) exactly. If the header text begins
* with the mbox delimiter (i.e., a line that begins with "From "), then that
* is excluded from the rawHeaderText value and is not reflected anywhere in
* this object.
*
* - contentType: This field contains the structured representation of the
* Content-Type header, if it is present. If it is not present, it is set to
* the structured representation of the default Content-Type for a part (as
* this data is not easily guessed given only MIME tree events).
*
* The constructor for these objects is not externally exported, and thus they
* can only be created via MimeParser.
*
* @param {BinaryString} rawHeaderText - The contents of the MIME headers to
* be parsed.
* @param {object} options - Options for the header parser.
* @param {boolean} options.stripcontinuations - If true, elide CRLFs from the
* raw header output.
*/
function StructuredHeaders(rawHeaderText, options) {
// An individual header is terminated by a CRLF, except if the CRLF is
// followed by a SP or TAB. Use negative lookahead to handle the latter case,
// and use non-capturing groups so the separators don't end up in the split
// results.
const values = rawHeaderText.split(/(?:\r\n|\n)(?![ \t])|\r(?![ \t\n])/);
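// For illustration with a hypothetical header block: "A: 1\r\n b\r\nB: 2"
// splits into ["A: 1\r\n b", "B: 2"]; the first CRLF is followed by a space
// and is therefore treated as a folded continuation, not a separator.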
// Ignore the first "header" if it begins with an mbox delimiter
if (values.length > 0 && values[0].substring(0, 5) == "From ") {
values.shift();
// Elide the mbox delimiter from this._headerData
if (values.length == 0) {
rawHeaderText = "";
} else {
rawHeaderText = rawHeaderText.substring(
rawHeaderText.indexOf(values[0])
);
}
}
const headers = new Map();
for (let i = 0; i < values.length; i++) {
// Look for a colon. If it's not present, this header line is malformed,
// perhaps by premature EOF or similar.
const colon = values[i].indexOf(":");
let header, val;
if (colon >= 0) {
header = values[i].substring(0, colon);
val = values[i].substring(colon + 1).trim();
if (options.stripcontinuations) {
val = val.replace(/[\r\n]/g, "");
}
} else {
header = values[i];
val = "";
}
// Canonicalize the header in lower-case form.
header = header.trim().toLowerCase();
// Omit "empty" headers
if (header == "") {
continue;
}
// We keep an array of values for each header, since a given header may be
// repeated multiple times.
if (headers.has(header)) {
headers.get(header).push(val);
} else {
headers.set(header, [val]);
}
}
/**
* A map of header names to arrays of raw values found in this header block
*/
this._rawHeaders = headers;
/**
* Cached results of structured header parsing.
*/
this._cachedHeaders = new Map();
Object.defineProperty(this, "rawHeaderText", {
get() {
return rawHeaderText;
},
});
Object.defineProperty(this, "size", {
get() {
return this._rawHeaders.size;
},
});
Object.defineProperty(this, "charset", {
get() {
return this._charset;
},
set(value) {
this._charset = value;
// Clear the cached headers, since this could change their values
this._cachedHeaders.clear();
},
});
// Default to the charset, until the message parser overrides us.
if ("charset" in options) {
this._charset = options.charset;
} else {
this._charset = null;
}
// If we have a Content-Type header, set contentType to return the structured
// representation. We don't set the value off the bat, since we want to let
// someone who changes the charset affect the values of 8-bit parameters.
Object.defineProperty(this, "contentType", {
configurable: true,
get() {
return this.get("Content-Type");
},
});
}
/**
* Get a raw header.
*
* Raw headers are an array of the header values, listed in the order they were
* specified in the header block, and without any attempt to convert charsets or
* apply RFC 2047 decoding. For example, in the following message (where the
* <XX> is meant to represent binary-octets):
*
* X-Header: Value A
* X-Header: V<C3><A5>lue B
* Header2: Q
*
* the result of calling getRawHeader('X-Header') or getRawHeader('x-header')
* would be ['Value A', 'V\xC3\xA5lue B'] and the result of
* getRawHeader('Header2') would be ['Q'].
*
* @param {string} headerName - The header name for which to get header values.
* @returns {BinaryString[]} the raw header values (with no charset conversion
* applied).
*/
StructuredHeaders.prototype.getRawHeader = function (headerName) {
return this._rawHeaders.get(headerName.toLowerCase());
};
/**
* Retrieve a structured version of the header.
*
* If there is a registered structured decoder (registration happens via
* headerparser.addStructuredDecoder), then the result of calling that decoder
* on the charset-corrected version of the header is returned. Otherwise, the
* values are charset-corrected and RFC 2047 decoding is applied as if the
* header were an unstructured header.
*
* A substantial set of headers have pre-registered structured decoders which,
* in some cases, cannot be overridden because the parser code itself relies on
* them.
*
* @param {string} headerName - The header name for which to get the header value.
* @returns {*} the structured representation of the header value.
*/
StructuredHeaders.prototype.get = function (headerName) {
// Normalize the header name to lower case
headerName = headerName.toLowerCase();
// First, check the cache for the header value
if (this._cachedHeaders.has(headerName)) {
return this._cachedHeaders.get(headerName);
}
// Not cached? Grab it [propagating lack of header to caller]
let headerValue = this._rawHeaders.get(headerName);
if (headerValue === undefined) {
return headerValue;
}
// Convert the header to Unicode
const charset = this.charset;
headerValue = headerValue.map(function (value) {
return headerparser.convert8BitHeader(value, charset);
});
// If there is a structured decoder, use that; otherwise, assume that the
// header is unstructured and only do RFC 2047 conversion
let structured;
try {
structured = headerparser.parseStructuredHeader(headerName, headerValue);
} catch (e) {
structured = headerValue.map(function (value) {
return headerparser.decodeRFC2047Words(value);
});
}
// Cache the result and return it
this._cachedHeaders.set(headerName, structured);
return structured;
};
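/*
 * For illustration (the header value is hypothetical): given a header line
 * "To: =?UTF-8?Q?Andr=C3=A9?= <andre@example.invalid>" on a StructuredHeaders
 * instance named headers,
 *
 *   headers.getRawHeader("To") // ["=?UTF-8?Q?Andr=C3=A9?= <andre@example.invalid>"]
 *   headers.get("To")          // the structured result of the addressing
 *                              // decoder, e.g. address objects carrying the
 *                              // decoded display name and the email address
 *
 * The exact shape of the structured value depends on the decoder registered
 * for the header in question.
 */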
/**
* Check if the message has the given header.
*
* @param {string} headerName - The header name for which to get the header value.
* @returns {boolean} true if the header is present in this header block.
*/
StructuredHeaders.prototype.has = function (headerName) {
// Check for presence in the raw headers instead of cached headers.
return this._rawHeaders.has(headerName.toLowerCase());
};
// Make a custom iterator. Use Symbol.iterator where it is available, and fall
// back to the legacy "@@iterator" name for engines without Symbol support.
var JS_HAS_SYMBOLS = typeof Symbol === "function";
var ITERATOR_SYMBOL = JS_HAS_SYMBOLS ? Symbol.iterator : "@@iterator";
/**
* An equivalent of Map.@@iterator, applied to the structured header
* representations. This is the function that makes
* for (let [header, value] of headers) work properly.
*/
StructuredHeaders.prototype[ITERATOR_SYMBOL] = function* () {
// Iterate over all the raw headers, and use the cached headers to retrieve
// them.
for (const headerName of this.keys()) {
yield [headerName, this.get(headerName)];
}
};
/**
* An equivalent of Map.forEach, applied to the structured header
* representations.
*
* @param {Function} callback - The (value, name, headers)
* callback to call for each header/value combo.
* @param {object} thisarg - The parameter that will be the |this| of the
* callback.
*/
StructuredHeaders.prototype.forEach = function (callback, thisarg) {
for (const [header, value] of this) {
callback.call(thisarg, value, header, this);
}
};
/**
* An equivalent of Map.entries, applied to the structured header
* representations.
*/
StructuredHeaders.prototype.entries =
StructuredHeaders.prototype[ITERATOR_SYMBOL];
// This function maps lower case names to a pseudo-preferred spelling.
function capitalize(headerName) {
return headerName.replace(/\b[a-z]/g, function (match) {
return match.toUpperCase();
});
}
/**
* An equivalent of Map.keys, applied to the structured header representations.
*/
StructuredHeaders.prototype.keys = function* () {
for (const headerName of this._rawHeaders.keys()) {
yield spellings.get(headerName) || capitalize(headerName);
}
};
/**
* An equivalent of Map.values, applied to the structured header
* representations.
*/
StructuredHeaders.prototype.values = function* () {
for (const [, value] of this) {
yield value;
}
};
/**
* A MIME parser.
*
* The inputs to the constructor consist of a callback object which receives
* information about the output data and an optional object containing the
* settings for the parser.
*
* The first parameter, emitter, is an object which contains several callbacks.
* Note that any and all of these methods are optional; the parser will not
* crash if one is missing. The callbacks are as follows:
* startMessage()
* Called when the stream to be parsed has started delivering data. This
* will be called exactly once, before any other call.
* endMessage()
* Called after all data has been delivered and the message parsing has
* been completed. This will be called exactly once, after any other call.
* startPart(string partNum, object headers)
* Called after the headers for a body part (including the top-level
* message) have been parsed. The first parameter is the part number (see
* the discussion on part numbering). The second parameter is an instance
* of StructuredHeaders that represents all of the headers for the part.
* endPart(string partNum)
* Called after all of the data for a body part (including sub-parts) has
* been parsed. The first parameter is the part number.
* deliverPartData(string partNum, {string,typedarray} data)
* Called when some data for a body part has been delivered. The first
* parameter is the part number. The second parameter is the data which is
* being delivered; the exact type of this data depends on the options
* used. Note that data is only delivered for leaf body parts.
*
* The second parameter, options, is an optional object containing the options
* for the parser. The following are the options that the parser may use:
* pruneat: <string> [default=""]
* Treat the message as starting at the given part number, so that only that
* part and its sub-parts are returned.
* bodyformat: one of {none, raw, nodecode, decode} [default=nodecode]
* How to return the bodies of parts:
* none: no part data is returned
* raw: the body of the part is passed through raw
* nodecode: the body is passed through without decoding QP/Base64
* decode: quoted-printable and base64 are fully decoded
* strformat: one of {binarystring, unicode, typedarray} [default=binarystring]
* How to treat output strings:
* binarystring: Data is a JS string with chars in the range [\x00-\xff]
* unicode: Data for text parts is converted to UTF-16; data for other
* parts is a typed array buffer, akin to typedarray.
* typedarray: Data is a JS typed array buffer
* charset: <string> [default=""]
* What charset to assume if no charset information is explicitly provided.
* This only matters if strformat is unicode. See above note on charsets
* for more details.
* force-charset: <boolean> [default=false]
* If true, this coerces all parts to use the charset option, even if the
* message specifies a different charset.
* stripcontinuations: <boolean> [default=true]
* If true, then the newlines in headers are removed in the returned
* header objects.
* onerror: <function(thrown error)> [default = no-op function]
* An error function that is called if an emitter callback throws an error.
* By default, such errors are swallowed by the parser. If you want the
* parser itself to throw an error, rethrow it via the onerror function.
* decodeSubMessages: <boolean> [default=true]
* Parse attached messages (message/rfc822, message/global & message/news)
* and return all of their mime data instead of returning their content
* as regular attachments.
*/
function MimeParser(emitter, options) {
// The actual emitter
this._emitter = emitter;
// Options for the parser (those listed here are defaults)
this._options = {
decodeSubMessages: true,
pruneat: "",
bodyformat: "nodecode",
strformat: "binarystring",
stripcontinuations: true,
charset: "",
"force-charset": false,
onerror() {},
};
// Load the options as a copy here (prevents people from changing on the fly).
if (options) {
for (var opt in options) {
this._options[opt] = options[opt];
}
}
// Ensure that the error function is in fact a function
if (typeof this._options.onerror != "function") {
throw new Error("onerror callback must be a function");
}
// Reset the parser
this.resetParser();
}
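/*
 * A minimal usage sketch; the handler object and message text below are made
 * up for illustration:
 *
 *   const handler = {
 *     startPart(partNum, headers) {
 *       console.log(partNum, headers.contentType.type);
 *     },
 *     deliverPartData(partNum, data) {
 *       console.log(partNum, data);
 *     },
 *   };
 *   const parser = new MimeParser(handler, { bodyformat: "decode" });
 *   parser.deliverData("Subject: Hi\r\n\r\nHello world\r\n");
 *   parser.deliverEOF();
 */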
/**
* Resets the parser to read a new message. This method need not be called
* immediately after construction.
*/
MimeParser.prototype.resetParser = function () {
// Current parser state
this._state = PARSING_HEADERS;
// Input data that needs to be held for buffer conditioning
this._holdData = "";
// Complete collection of headers (also used to accumulate _headerData)
this._headerData = "";
// Whether or not emitter.startMessage has been called
this._triggeredCall = false;
// Splitting input
this._splitRegex = this._handleSplit = undefined;
// Subparsing
this._subparser = this._subPartNum = undefined;
// Data that has yet to be consumed by _convertData
this._savedBuffer = "";
// Convert data
this._convertData = undefined;
// String decoder
this._decoder = undefined;
};
/**
* Deliver a buffer of data to the parser.
*
* @param {BinaryString} buffer - The raw data to add to the message.
*/
MimeParser.prototype.deliverData = function (buffer) {
// In ideal circumstances, we'd like to parse the message all at once. In
// reality, though, data will be coming to us in packets. To keep the amount
// of saved state low, we want to make basic guarantees about how packets get
// delivered. Our basic model is a twist on line-buffering, as the format of
// MIME and messages make it hard to not do so: we can handle multiple lines
// at once. To ensure this, we start by conditioning the packet by
// withholding data to make sure that the internal deliveries have the
// guarantees. This implies that we need to do the following steps:
// 1. We don't know if a `\r' comes from `\r\n' or the old mac line ending
// until we see the next character. So withhold the last `\r'.
// 2. Ensure that every packet ends on a newline. So scan for the end of the
// line and withhold until the \r\n comes through.
// [Note that this means that an input message that uses \r line endings and
// is being passed to us via a line-buffered input is going to have most of
// its data withheld until the next buffer. Since bare \r line endings are so
// uncommon in modern times, this is an acceptable loss.]
// 3. Eliminate empty packets.
// Add in previously saved data
if (this._holdData) {
buffer = this._holdData + buffer;
this._holdData = "";
}
// Condition the input, so that we get the multiline-buffering mentioned in
// the above comment.
if (buffer.length > 0) {
[buffer, this._holdData] = conditionToEndOnCRLF(buffer);
}
// Ignore 0-length buffers.
if (buffer.length == 0) {
return;
}
// Signal the beginning, if we haven't done so.
if (!this._triggeredCall) {
this._callEmitter("startMessage");
this._triggeredCall = true;
}
// Finally, send it to the internal parser.
this._dispatchData("", buffer, true);
};
/**
* Ensure that a set of data always ends in an end-of-line character.
*
* @param {BinaryString} buffer - The data with no guarantees about where it ends.
* @returns {BinaryString[]} An array of 2 binary strings where the first string
* ends in a newline and the last string contains the text in buffer
* following the first string.
*/
function conditionToEndOnCRLF(buffer) {
// Find the last occurrence of '\r' or '\n' to split the string. However, we
// don't want to consider '\r' if it is the very last character, as we need
// the next packet to tell if the '\r' is the beginning of a CRLF or a line
// ending by itself.
const lastCR = buffer.lastIndexOf("\r", buffer.length - 2);
const lastLF = buffer.lastIndexOf("\n");
const end = lastLF > lastCR ? lastLF : lastCR;
return [buffer.substring(0, end + 1), buffer.substring(end + 1)];
}
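// For illustration: conditionToEndOnCRLF("a\r\nb") returns ["a\r\n", "b"],
// and conditionToEndOnCRLF("abc\r") returns ["", "abc\r"], because the final
// "\r" might be the start of a CRLF pair in the next packet.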
/**
* Tell the parser that all of the data has been delivered.
*
* This will flush all of the internal state of the parser.
*/
MimeParser.prototype.deliverEOF = function () {
// Start of input buffered too long? Call start message now.
if (!this._triggeredCall) {
this._triggeredCall = true;
this._callEmitter("startMessage");
}
// Force a flush of all of the data.
if (this._holdData) {
this._dispatchData("", this._holdData, true);
}
this._dispatchEOF("");
// Signal to the emitter that we're done.
this._callEmitter("endMessage");
};
/**
* Calls a method on the emitter safely.
*
* This method ensures that errors in the emitter call won't cause the parser
* to exit with an error, unless the user wants it to.
*
* @param {string} funcname - The function name to call on the emitter.
* @param {...?} args - Extra arguments to pass into the emitter callback.
*/
MimeParser.prototype._callEmitter = function (funcname, ...args) {
if (this._emitter && funcname in this._emitter) {
if (args.length > 0 && this._willIgnorePart(args[0])) {
// partNum is always the first argument, so check to make sure that it
// satisfies our emitter's pruneat requirement.
return;
}
try {
this._emitter[funcname].apply(this._emitter, args);
} catch (e) {
// We ensure that the onerror attribute in options is a function, so this
// is always safe.
this._options.onerror(e);
}
}
};
/**
* Helper function to decide if a part's output will never be seen.
*
* @param {string} part - The number of the part.
* @returns {boolean} true if the emitter is not interested in this part.
*/
MimeParser.prototype._willIgnorePart = function (part) {
if (this._options.pruneat) {
const match = this._options.pruneat;
const start = part.substr(0, match.length);
// It needs to start with and follow with a new part indicator
// (i.e., don't let 10 match with 1, but let 1.1 or 1$ do so)
if (
start != match ||
(match.length < part.length && !"$.".includes(part[match.length]))
) {
return true;
}
}
return false;
};
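// For illustration, with pruneat set to the hypothetical value "1.1":
//   _willIgnorePart("1.1")   -> false (the requested part itself)
//   _willIgnorePart("1.1.2") -> false (a sub-part of the requested part)
//   _willIgnorePart("1.1$")  -> false (the body of a message sub-part)
//   _willIgnorePart("1.10")  -> true  ("1.10" is a sibling, not a sub-part)
//   _willIgnorePart("2")     -> true  (outside the requested subtree)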
// MIME parser core
// ----------------
// This MIME parser is a stateful parser; handling of the MIME tree is mostly
// done by creating new parsers and feeding data to them manually. In parallel
// to the externally-visible deliverData and deliverEOF, the two methods
// _dispatchData and _dispatchEOF are the internal counterparts that do the
// main work of moving data to where it needs to go; helper functions are used
// to handle translation.
//
// The overall flow of the parser is this. First, it buffers all of the data
// until the dual-CRLF pattern is noticed. Once that is found, it parses the
// entire header chunk at once. As a result of header parsing, the parser enters
// one of three modes for handling data, and uses a special regex to change
// modes and handle state changes. Specific details about the states the parser
// can be in are as follows:
// PARSING_HEADERS: The input buffer is concatenated to the currently-received
// text, which is then searched for the CRLFCRLF pattern. If found, the data
// is split at this boundary; the first chunk is parsed using _parseHeaders,
// and the second chunk will fall through to buffer processing. After
// splitting, the headers are delivered via the emitter, and _startBody is
// called to set up state for the parser.
// SEND_TO_BLACK_HOLE: All data in the input is ignored.
// SEND_TO_EMITTER: All data is passed into the emitter, if it is desired.
// Data can be optionally converted with this._convertData.
// SEND_TO_SUBPARSER: All data is passed into the subparser's _dispatchData
// method, using _subPartNum as the part number and _subparser as the object
// to call. Data can be optionally converted first with this._convertData.
//
// Additional state modifications can be done using a regex in _splitRegex and
// the callback method this._handleSplit(partNum, regexResult). The _handleSplit
// callback is free to do any modification to the current parser, including
// modifying the _splitRegex value. Packet conditioning guarantees that every
// buffer string passed into _dispatchData will have started immediately after a
// newline character in the fully assembled message.
//
// The this._convertData method, if present, is expected to return an array of
// two values, [{typedarray, string} decoded_buffer, string unused_buffer], and
// has as its arguments (string buffer, bool moreToCome).
//
// The header parsing by itself does very little parsing, only parsing as if all
// headers were unstructured fields. Values are munged so that embedded newlines
// are stripped and the result is also trimmed. Headers themselves are
// canonicalized into lower-case.
// Parser states. See the large comment above.
var PARSING_HEADERS = 1;
var SEND_TO_BLACK_HOLE = 2;
var SEND_TO_EMITTER = 3;
var SEND_TO_SUBPARSER = 4;
/**
* Main dispatch for incoming packet data.
*
* The incoming data needs to have been sanitized so that each packet begins on
* a newline boundary. The part number for the current parser also needs to be
* passed in. The checkSplit parameter controls whether or not the data in
* buffer needs to be checked against _splitRegex; this is used internally for
* the mechanics of splitting and should otherwise always be true.
*
* @param {string} partNum - The part number being currently parsed.
* @param {BinaryString} buffer - The text (conditioned as mentioned above) to
* pass to the parser.
* @param {boolean} checkSplit - If true, split the text using _splitRegex.
* This is set to false internally to handle low-level splitting details.
*/
MimeParser.prototype._dispatchData = function (partNum, buffer, checkSplit) {
// Are we parsing headers?
if (this._state == PARSING_HEADERS) {
this._headerData += buffer;
// Find the end of the headers--either it's a CRLF at the beginning (in
// which case we have no headers), or it's a pair of CRLFs.
const result = /(?:^(?:\r\n|[\r\n]))|(\r\n|[\r\n])\1/.exec(
this._headerData
);
if (result != null) {
// If we found the end of headers, split the data at this point and send
// the stuff after the double-CRLF into the later body parsing.
const headers = this._headerData.substr(0, result.index);
buffer = this._headerData.substring(result.index + result[0].length);
this._headerData = headers;
this._headers = this._parseHeaders();
this._callEmitter("startPart", partNum, this._headers);
this._startBody(partNum);
} else {
return;
}
}
// We're in the middle of the body. Start by testing the split regex, to see
// if there is anything that needs to be done.
if (checkSplit && this._splitRegex) {
const splitResult = this._splitRegex.exec(buffer);
if (splitResult) {
// Pass the text before the split through the current state.
const start = splitResult.index,
len = splitResult[0].length;
if (start > 0) {
this._dispatchData(partNum, buffer.substr(0, start), false);
}
// Tell the handler that we've seen the split. Note that this can change
// any method on `this'.
this._handleSplit(partNum, splitResult);
// Send the rest of the data to where it needs to go. There could be more
// splits in the data, so watch out!
buffer = buffer.substring(start + len);
if (buffer.length > 0) {
this._dispatchData(partNum, buffer, true);
}
return;
}
}
// Where does the data go?
if (this._state == SEND_TO_BLACK_HOLE) {
// Don't send any data when going to the black hole.
} else if (this._state == SEND_TO_EMITTER) {
// Don't pass body data if the format is to be none
const passData = this._options.bodyformat != "none";
if (!passData || this._willIgnorePart(partNum)) {
return;
}
buffer = this._applyDataConversion(buffer, this._options.strformat);
if (buffer.length > 0) {
this._callEmitter("deliverPartData", partNum, buffer);
}
} else if (this._state == SEND_TO_SUBPARSER) {
buffer = this._applyDataConversion(buffer, "binarystring");
if (buffer.length > 0) {
this._subparser._dispatchData(this._subPartNum, buffer, true);
}
}
};
/**
* Output data using the desired output format, retaining any leftover input
* that the data conversion could not consume yet.
*
* @param {BinaryString} buf - The data to be sent to the output.
* @param {string} type - The type of the data to output. Valid values are
* the same as the strformat option.
* @returns {BinaryString|string|Uint8Array} coerced and converted data that
* can be sent to the emitter or subparser.
*/
MimeParser.prototype._applyDataConversion = function (buf, type) {
// If we need to convert data, do so.
if (this._convertData) {
// Prepend leftover data from the last conversion.
buf = this._savedBuffer + buf;
[buf, this._savedBuffer] = this._convertData(buf, true);
}
return this._coerceData(buf, type, false);
};
/**
* Coerce the input buffer into the given output type.
*
* @param {BinaryString|Uint8Array} buffer - The data to be converted.
* @param {string} type - The type to convert the data to.
* @param {boolean} more - If true, more data may be passed in later calls;
* if false, this is the final chunk of data for this part.
* @returns {BinaryString|string|Uint8Array} the desired output format.
*/
// Coerces the buffer (a string or typedarray) into a given type
MimeParser.prototype._coerceData = function (buffer, type, more) {
if (typeof buffer == "string") {
// string -> binarystring is a nop
if (type == "binarystring") {
return buffer;
}
// Either we're going to a typed array or to Unicode. Both paths need the array
var typedarray = mimeutils.stringToTypedArray(buffer);
// If it's unicode, do the coercion from the array
// If it's a typed array, just return the synthesized one
return type == "unicode"
? this._coerceData(typedarray, "unicode", more)
: typedarray;
} else if (type == "binarystring") {
// Doing array -> binarystring
return mimeutils.typedArrayToString(buffer);
} else if (type == "unicode") {
// Doing array-> unicode: Use the decoder set up earlier to convert
if (this._decoder) {
return this._decoder.decode(buffer, { stream: more });
}
// If there is no charset, just return the typed array instead.
return buffer;
}
throw new Error("Invalid type: " + type);
};
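// For illustration (exact Unicode output depends on the decoder installed for
// the part):
//   _coerceData("Ab", "typedarray", false)                     -> Uint8Array [65, 98]
//   _coerceData(Uint8Array.of(65, 98), "binarystring", false)  -> "Ab"
//   _coerceData("Ab", "unicode", false) uses this._decoder when one was set
//   up for the part, and otherwise returns the typed array unchanged.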
/**
* Signal that no more data will be dispatched to this parser.
*
* @param {string} partNum - The part number being currently parsed.
*/
MimeParser.prototype._dispatchEOF = function (partNum) {
if (this._state == PARSING_HEADERS) {
// Unexpected EOF in headers. Parse them now and call startPart/endPart
this._headers = this._parseHeaders();
this._callEmitter("startPart", partNum, this._headers);
} else if (this._state == SEND_TO_SUBPARSER) {
// Pass in any lingering data
if (this._convertData && this._savedBuffer) {
this._subparser._dispatchData(
this._subPartNum,
this._convertData(this._savedBuffer, false)[0],
true
);
}
this._subparser._dispatchEOF(this._subPartNum);
// Clean up after ourselves
this._subparser = null;
} else if (this._convertData && this._savedBuffer) {
// Convert lingering data
let [buffer] = this._convertData(this._savedBuffer, false);
buffer = this._coerceData(buffer, this._options.strformat, false);
if (buffer.length > 0) {
this._callEmitter("deliverPartData", partNum, buffer);
}
}
// We've reached EOF for this part; tell the emitter
this._callEmitter("endPart", partNum);
};
/**
* Produce a dictionary of all headers as if they were unstructured fields.
*
* @returns {StructuredHeaders} The structured header object for the header
* block.
*/
MimeParser.prototype._parseHeaders = function () {
const headers = new StructuredHeaders(this._headerData, this._options);
// Fill in the contentType property of headers.
let contentType = headers.get("Content-Type");
if (typeof contentType === "undefined") {
contentType = headerparser.parseStructuredHeader(
"Content-Type",
this._defaultContentType || "text/plain"
);
Object.defineProperty(headers, "contentType", {
get() {
return contentType;
},
});
} else {
Object.defineProperty(headers, "contentType", { configurable: false });
}
// Find the charset for the current part. If the user requested a forced
// conversion, use that first. Otherwise, check the content-type for one and
// fall back to the charset option if it is not present.
let charset = "";
if (this._options["force-charset"]) {
charset = this._options.charset;
} else if (contentType.has("charset")) {
charset = contentType.get("charset");
} else {
charset = this._options.charset;
}
headers.charset = charset;
// Retain a copy of the charset so that users don't override our decision for
// decoding body parts.
this._charset = charset;
return headers;
};
/**
* Initialize the parser state for the body of this message.
*
* @param {string} partNum - The part number being currently parsed.
*/
MimeParser.prototype._startBody = function (partNum) {
const contentType = this._headers.contentType;
// Should the bodyformat be raw, we just want to pass through all data without
// trying to interpret it.
if (this._options.bodyformat == "raw" && partNum == this._options.pruneat) {
this._state = SEND_TO_EMITTER;
return;
}
// The output depends on the content-type. Basic rule of thumb:
// 1. Discrete media types (text, video, audio, image, application) are passed
// through with no alterations beyond Content-Transfer-Encoding unpacking.
// 2. Everything with a media type of multipart is treated the same.
// 3. Any message/* type that acts like a mail message (rfc822, news, global)
// is parsed as a header/body pair again. Most of the other message/* types
// have similar structures, but they don't have cascading child subparts,
// so it's better to pass their entire contents to the emitter and let the
// consumer deal with them.
// 4. Data is treated as untyped only when no Content-Type header is present
//    at all; this helps avoid false positives.
// If pruneat is set but also this._options.decodeSubMessages == false, we
// might never reach the requested part. Enforce parsing of sub messages, if
// the requested part is in that message.
let decodeSubMessages = this._options.decodeSubMessages;
if (!decodeSubMessages && this._options.pruneat) {
decodeSubMessages =
this._options.pruneat.length > partNum.length &&
this._options.pruneat.startsWith(partNum);
}
if (contentType.mediatype == "multipart") {
// If there's no boundary type, everything will be part of the prologue of
// the multipart message, so just feed everything into a black hole.
if (!contentType.has("boundary")) {
this._state = SEND_TO_BLACK_HOLE;
return;
}
// The boundary of a multipart message needs to start with -- and be at the
// beginning of the line. If -- is after the boundary, it represents the
// terminator of the multipart. After the line, there may be only whitespace
// and then the CRLF at the end. Since the CRLFs in here are necessary for
// distinguishing the parts, they are not included in the subparts, so we
// need to capture them in the regex as well to prevent them leaking out.
this._splitRegex = new RegExp(
"(\r\n|[\r\n]|^)--" +
contentType.get("boundary").replace(/[\\^$*+?.()|{}[\]]/g, "\\$&") +
"(--)?[ \t]*(?:\r\n|[\r\n]|$)"
);
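// For illustration, with a hypothetical boundary of "abc", this regex matches
// delimiter lines such as "\r\n--abc\r\n" and the terminator "\r\n--abc--\r\n",
// including the CRLF that precedes and follows the delimiter.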
this._handleSplit = this._whenMultipart;
this._subparser = new MimeParser(this._emitter, this._options);
// multipart/digest defaults to message/rfc822 instead of text/plain
if (contentType.subtype == "digest") {
this._subparser._defaultContentType = "message/rfc822";
}
// All text before the first boundary and after the closing boundary are
// supposed to be ignored ("must be ignored", according to RFC 2046 §5.1.1);
// in accordance with these wishes, ensure they don't get passed to any
// deliverPartData.
this._state = SEND_TO_BLACK_HOLE;
// Multipart MIME messages stipulate that the final CRLF before the boundary
// delimiter is not matched. When the packet ends on a CRLF, we don't know
// if the next text could be the boundary. Therefore, we need to withhold
// the last line of text to be sure of what's going on. The _convertData is
// how we do this, even though we're not really converting any data.
this._convertData = function (buffer, more) {
let splitPoint = buffer.length;
if (more) {
if (buffer.charAt(splitPoint - 1) == "\n") {
splitPoint--;
}
if (splitPoint >= 0 && buffer.charAt(splitPoint - 1) == "\r") {
splitPoint--;
}
}
const res = conditionToEndOnCRLF(buffer.substring(0, splitPoint));
const preLF = res[0];
const rest = res[1];
return [preLF, rest + buffer.substring(splitPoint)];
};
} else if (
(decodeSubMessages || this._willIgnorePart(partNum)) &&
(contentType.type == "message/rfc822" ||
contentType.type == "message/global" ||
contentType.type == "message/news")
) {
// The subpart is just another header/body pair that goes to EOF, so just
// return the parse from that blob
this._state = SEND_TO_SUBPARSER;
this._subPartNum = partNum + "$";
this._subparser = new MimeParser(this._emitter, this._options);
// So, RFC 6532 happily allows message/global types to have CTE applied.
// This means that subparts would need to be decoded to determine their
// contents properly. There seems to be some evidence that message/rfc822
// that is illegally-encoded exists in the wild, so be lenient and decode
// for any message/* type that gets here.
const cte = this._extractHeader("content-transfer-encoding", "");
if (cte in ContentDecoders) {
this._convertData = ContentDecoders[cte];
}
} else {
// Okay, we just have to feed the data into the output
this._state = SEND_TO_EMITTER;
if (this._options.bodyformat == "decode") {
// If we wish to decode, look it up in one of our decoders.
const cte = this._extractHeader("content-transfer-encoding", "");
if (cte in ContentDecoders) {
this._convertData = ContentDecoders[cte];
}
}
}
// Set up the decoder for charset conversions; only do this for text parts.
// Other parts are almost certainly binary, so no translation should be
// applied to them.
if (
this._options.strformat == "unicode" &&
contentType.mediatype == "text"
) {
// If the charset is nonempty, initialize the decoder
this._decoder = null;
if (this._charset !== "") {
try {
this._decoder = new mimeutils.MimeTextDecoder(this._charset);
} catch (e) {}
}
if (!this._decoder) {
// There's no charset we can use for decoding, so pass the data through an
// identity decoder, or else this._coerceData will complain.
this._decoder = {
decode(buffer) {
return MimeParser.prototype._coerceData(
buffer,
"binarystring",
true
);
},
};
}
} else {
this._decoder = null;
}
};
// Internal split handling for multipart messages.
/**
* When a multipart boundary is found, handle the process of managing the
* subparser state. This is meant to be used as a value for this._handleSplit.
*
* @param {string} partNum - The part number being currently parsed.
* @param {Array} lastResult - The result of the regular expression match.
*/
MimeParser.prototype._whenMultipart = function (partNum, lastResult) {
// Fix up the part number (don't do '' -> '.4' and don't do '1' -> '14')
if (partNum != "") {
partNum += ".";
}
if (!this._subPartNum) {
// No count? This means that this is the first time we've seen the boundary,
// so do some initialization for later here.
this._count = 1;
} else {
// If we did not match a CRLF at the beginning of the line, strip CRLF from
// the saved buffer. We do this in the else block because it is not
// necessary for the prologue, since that gets ignored anyways.
if (this._savedBuffer != "" && lastResult[1] === "") {
let useEnd = this._savedBuffer.length - 1;
if (this._savedBuffer[useEnd] == "\n") {
useEnd--;
}
if (useEnd >= 0 && this._savedBuffer[useEnd] == "\r") {
useEnd--;
}
this._savedBuffer = this._savedBuffer.substring(0, useEnd + 1);
}
// If we have saved data and we matched a CRLF, pass the saved data in.
if (this._savedBuffer != "") {
this._subparser._dispatchData(
this._subPartNum,
this._savedBuffer,
true
);
}
// We've seen the boundary at least once before, so this must end a subpart.
// Tell that subpart that it has reached EOF.
this._subparser._dispatchEOF(this._subPartNum);
}
this._savedBuffer = "";
// The split regex has a capture group on the (--)?, so if its result is present,
// then we have seen the terminator. Alternatively, the message may have been
// mangled to exclude the terminator, so also check if EOF has occurred.
if (lastResult[2] == undefined) {
this._subparser.resetParser();
this._state = SEND_TO_SUBPARSER;
this._subPartNum = partNum + this._count;
this._count += 1;
} else {
// Ignore the epilogue
this._splitRegex = null;
this._state = SEND_TO_BLACK_HOLE;
}
};
/**
* Return the structured header from the current header block, or a default if
* it is not present.
*
* @param {string} headerName - The header name to get.
* @param {string} dflt - The default MIME value of the header.
* @returns {*} the structured representation of the header value.
*/
MimeParser.prototype._extractHeader = function (headerName, dflt) {
headerName = headerName.toLowerCase(); // Normalize name
return this._headers.has(headerName)
? this._headers.get(headerName)
: headerparser.parseStructuredHeader(headerName, [dflt]);
};
var ContentDecoders = {};
ContentDecoders["quoted-printable"] = mimeutils.decode_qp;
ContentDecoders.base64 = mimeutils.decode_base64;
return MimeParser;
};
const MimeParser = JsMIMEmimeparser();
/**
* Implements the code for emitting structured representations of
* MIME headers into their encoded forms. The code here is a companion to,
* but completely independent of, headerparser: the structured
* representations that are used as input to the functions here are the
* same forms that would be parsed.
*/
const JsMIMEheaderemitter = function () {
// Get the default structured encoders and add them to the map
var encoders = new Map();
var preferredSpellings = structuredHeaders.spellings;
for (const [header, encoder] of structuredHeaders.encoders) {
addStructuredEncoder(header, encoder);
}
// Clamp a value in the range [min, max], defaulting to def
// if the object[property] does not contain the value.
function clamp(object, property, min, max, def) {
if (!(property in object)) {
return def;
}
const value = object[property];
if (value < min) {
return min;
}
if (value > max) {
return max;
}
return value;
}
/**
* An object that can assemble structured header representations into their MIME
* representation.
*
* The character-counting portion of this class operates using individual JS
* characters as its representation of logical character, which is not the same
* as the number of octets used as UTF-8. If non-ASCII characters are to be
* included in headers without some form of encoding, then care should be taken
* to set the maximum line length to account for the mismatch between character
* counts and octet counts: the maximum line is 998 octets, which could be as
* few as 332 JS characters (non-BMP characters, although they take up 4 octets
* in UTF-8, count as 2 in JS strings).
*
* This code takes care to only insert line breaks at the higher-level breaking
* points in a header (as recommended by RFC 5322), but it may need to resort to
* including them more aggressively if this is not possible. If even aggressive
* line-breaking cannot allow a header to be emitted without violating line
* length restrictions, the methods will throw an exception to indicate this
* situation.
*
* In general, this code does not attempt to modify its input; for example, it
* does not attempt to change the case of any input characters, apply any
* Unicode normalization algorithms, or convert email addresses to ACE where
* applicable. The biggest exception to this rule is that most whitespace is
* collapsed to a single space, even in unstructured headers, while most leading
* and trailing whitespace is trimmed from inputs.
*
* @param {StreamHandler} handler - The handler to which all output is sent.
* @param {Function} handler.deliverData - Receives encoded data. Takes string input.
* @param {Function} handler.deliverEOF - Called once all text has been sent.
* @param {object} options - Options for the emitter.
* @param {integer} [options.softMargin=78] - 30 <= softMargin <= 900
* The ideal maximum number of logical characters to include in a line, not
* including the final CRLF pair. Lines may exceed this margin if parameters
* are excessively long.
* @param {integer} [options.hardMargin=332] - softMargin <= hardMargin <= 998
* The maximum number of logical characters that can be included in a line,
* not including the final CRLF pair. If this count would be exceeded, then
* an error will be thrown and encoding will not be possible.
* @param {boolean} [options.useASCII=true]
* If true, then RFC 2047 and RFC 2231 encoding of headers will be performed
* as needed to retain headers as ASCII.
*/
function HeaderEmitter(handler, options) {
// The inferred value of options.useASCII
this._useASCII = options.useASCII === undefined ? true : options.useASCII;
this._sanitizeDate =
options.sanitizeDate === undefined ? false : options.sanitizeDate;
// The handler to use.
this._handler = handler;
/**
* The current line being built; note that we may insert a line break in the
* middle to keep under the maximum line length.
*
* @type {string}
*/
this._currentLine = "";
// Our bounds for the soft and hard margins are not completely arbitrary. The
// minimum amount we need to encode is 20 characters, which can encode a single
// non-BMP character with RFC 2047. The value of 30 is chosen to give some
// breathing room for delimiters or other unbreakable characters. The maximum
// length is 998 octets, per RFC 5322; soft margins are slightly lower to
// allow for breathing room as well. The default of 78 for the soft margin is
// recommended by RFC 5322.
this._softMargin = clamp(options, "softMargin", 30, 900, 78);
this._hardMargin = clamp(options, "hardMargin", this._softMargin, 998, 998);
/**
* The index of the last preferred breakable position in the current line.
*
* @type {integer}
*/
this._preferredBreakpoint = 0;
}
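/*
 * A minimal usage sketch; the handler object here is made up, and the emitted
 * text is delivered through handler.deliverData as lines are committed:
 *
 *   let output = "";
 *   const emitter = new HeaderEmitter(
 *     { deliverData(str) { output += str; }, deliverEOF() {} },
 *     { softMargin: 78 }
 *   );
 *   emitter.addStructuredHeader("subject", "Hello world");
 *   // Once the emitter commits the current line, output contains
 *   // "Subject: Hello world" folded to fit within the configured margins.
 */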
// Low-level methods
// -----------------
// Explanation of the emitter internals:
// RFC 5322 requires that we wrap our lines, ideally at 78 characters and at
// the very latest by 998 octets. We can't wrap in arbitrary places, but
// wherever CFWS is
// valid... and ideally wherever clients are likely to expect it. In theory, we
// can break between every token (this is how RFC 822 operates), but, in RFC
// 5322, many of those breaks are relegated to obsolete productions, mostly
// because it is common to not properly handle breaks in those locations.
//
// So how do we do line breaking? The algorithm we implement is greedy, to
// simplify implementation. There are two margins: the soft margin, which we
// want to keep within, and the hard margin, which we absolutely have to keep
// within. There are also two kinds of break points: preferred and emergency.
// As long as we keep the line within the hard margin, we will only break at
// preferred breakpoints; emergency breakpoints are only used if we would
// otherwise exceed the hard margin.
//
// For illustration, here is an example header and where these break points are
// located:
//
// To: John "The Rock" Smith <jsmith@a.long.domain.invalid>
// Preferred: ^ ^ ^
// Emergency: ^ ^ ^ ^^ ^ ^ ^ ^ ^
//
// Preferred breakpoints are indicated by setting the mayBreakAfter parameter of
// addText to true, while emergency breakpoints are set after every token passed
// into addText. This is handled implicitly by only adding text to _currentLine
// if it ends in an emergency breakpoint.
//
// Internally, the code keeps track of margins by use of two variables. The
// _softMargin and _hardMargin variables encode the positions at which the code
// prefers to break and at which it absolutely must break, respectively; both
// are set up from the initial options parameter. Breaking
// happens when _currentLine.length approaches these values, as mentioned above.
/**
* Send a header line consisting of the first N characters to the handler.
*
* If the count parameter is missing, then we presume that the current header
* value being emitted is done and therefore we should not send a continuation
* space. Otherwise, we presume that we're still working, so we will send the
* continuation space.
*
* @param {integer} [count] - The number of characters in the current line to
* include before wrapping.
*/
HeaderEmitter.prototype._commitLine = function (count) {
const isContinuing = typeof count !== "undefined";
// Split at the point, and lop off whitespace immediately before and after.
let firstN, lastN;
if (isContinuing) {
firstN = this._currentLine.slice(0, count).trimRight();
lastN = this._currentLine.slice(count).trimLeft();
} else {
firstN = this._currentLine.trimRight();
lastN = "";
}
// Send the line plus the final CRLF.
this._handler.deliverData(firstN + "\r\n");
// Fill the start of the line with the new data.
this._currentLine = lastN;
// If this is a continuation, add an extra space at the beginning of the line.
// Adjust the breakpoint shift amount as well.
if (isContinuing) {
this._currentLine = " " + this._currentLine;
}
// We will always break at a point at or after the _preferredBreakpoint, if it
// exists, so this always gets reset to 0.
this._preferredBreakpoint = 0;
};
/**
* Reserve at least length characters in the current line. If there aren't
* enough characters, insert a line break.
*
* @param {integer} resLength - The number of characters to reserve space for.
* @returns {boolean} whether or not there is enough space for length characters.
*/
HeaderEmitter.prototype._reserveTokenSpace = function (resLength) {
// We are not going to do a sanity check that length is within the wrap
// margins. The rationale is that this lets code simply call this function to
// force a higher-level line break than normal preferred line breaks (see
// addAddress for an example use). The text to be added may itself need to be
// broken up, so it might not need all of the reserved length anyway, but the
// break is started here regardless.
// If we have enough space, we don't need to do anything.
if (this._currentLine.length + resLength <= this._softMargin) {
return true;
}
// If we have a preferred breakpoint, commit the line at that point, and see
// if that is sufficient line-breaking.
if (this._preferredBreakpoint > 0) {
this._commitLine(this._preferredBreakpoint);
if (this._currentLine.length + resLength <= this._softMargin) {
return true;
}
}
// At this point, we can no longer keep within the soft margin. Let us see if
// we can fit within the hard margin.
if (this._currentLine.length + resLength <= this._hardMargin) {
return true;
}
// Adding text of this length would violate the hard margin as well. Break at
// the last emergency breakpoint.
if (this._currentLine.length > 0) {
this._commitLine(this._currentLine.length);
}
// At this point, if there is still insufficient room in the hard margin, we
// can no longer do anything to encode this word. Bail.
return this._currentLine.length + resLength <= this._hardMargin;
};
/**
* Adds a block of text to the current header, inserting a break if necessary.
* If mayBreakAfter is true and text does not end in whitespace, a single space
* character may be added to the output. If the text could not be added without
* violating line length restrictions, an error is thrown instead.
*
* @param {string} text - The text to add to the output.
* @param {boolean} mayBreakAfter - If true, the end of this text is a
* preferred breakpoint.
*/
HeaderEmitter.prototype.addText = function (text, mayBreakAfter) {
// Try to reserve space for the tokens. If we can't, give up.
if (!this._reserveTokenSpace(text.length)) {
throw new Error("Cannot encode " + text + " due to length.");
}
this._currentLine += text;
if (mayBreakAfter) {
// Make sure that there is an extra space if text could break afterwards.
this._preferredBreakpoint = this._currentLine.length;
if (text[text.length - 1] != " ") {
this._currentLine += " ";
}
}
};
/**
* Adds a block of text that may need quoting if it contains some character in
* qchars. If it is already quoted, no quoting will be applied. If the text
* cannot be added without violating maximum line length, an error is thrown
* instead.
*
* @param {string} text - The text to add to the output.
* @param {string} qchars - The set of characters that cannot appear outside
* of a quoted string.
* @param {boolean} mayBreakAfter - If true, the end of this text is a preferred
* breakpoint.
*/
HeaderEmitter.prototype.addQuotable = function (text, qchars, mayBreakAfter) {
// No text -> no need to be quoted (prevents strict warning errors).
if (text.length == 0) {
return;
}
// Figure out if we need to quote the string. Don't quote a string which
// already appears to be quoted.
let needsQuote = false;
if (!(text[0] == '"' && text[text.length - 1] == '"') && qchars != "") {
for (let i = 0; i < text.length; i++) {
if (qchars.includes(text[i])) {
needsQuote = true;
break;
}
}
}
if (needsQuote) {
text = '"' + text.replace(/["\\]/g, "\\$&") + '"';
}
this.addText(text, mayBreakAfter);
};
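// For illustration (the qchars value ',;' is just a hypothetical example):
// calling addQuotable('Doe, John', ',;', true) on an emitter appends the
// quoted form "Doe, John" (in double quotes) to the current line, because the
// text contains a comma from qchars; addQuotable('John', ',;', true) appends
// the text unquoted.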
/**
* Adds a block of text that corresponds to the phrase production in RFC 5322.
* Such text is a sequence of atoms, quoted-strings, or RFC-2047 encoded-words.
* This method will preprocess input to normalize all space sequences to a
* single space. If the text cannot be added without violating maximum line
* length, an error is thrown instead.
*
* @param {string} text - The text to add to the output.
* @param {string} qchars - The set of characters that cannot appear outside
* of a quoted string.
* @param {boolean} mayBreakAfter - If true, the end of this text is a preferred
* breakpoint.
*/
HeaderEmitter.prototype.addPhrase = function (text, qchars, mayBreakAfter) {
// Collapse all whitespace spans into a single whitespace node.
text = text.replace(/[ \t\r\n]+/g, " ");
// If we have non-ASCII text, encode it using RFC 2047.
if (this._useASCII && nonAsciiRe.test(text)) {
this.encodeRFC2047Phrase(text, mayBreakAfter);
return;
}
// If quoting the entire string at once could fit in the line length, then do
// so. The check here is very loose, but it tells us whether we are definitely
// going to overrun the soft margin.
if (this._currentLine.length + text.length < this._softMargin) {
try {
this.addQuotable(text, qchars, mayBreakAfter);
// If we don't have a breakpoint, and the text is encoded as a sequence of
// atoms (and not a quoted-string), then make the last space we added a
// breakpoint, regardless of the mayBreakAfter setting.
if (this._preferredBreakpoint == 0 && text.includes(" ")) {
if (this._currentLine[this._currentLine.length - 1] != '"') {
this._preferredBreakpoint = this._currentLine.lastIndexOf(" ");
}
}
return;
} catch (e) {
// If we get an error at this point, we failed to add the quoted string
// because the string was too long. Fall through to the case where we know
// that the input was too long to begin with.
}
}
// If the text is too long, split the quotable string at space boundaries and
// add each word individually. If we still can't add all those words, there is
// nothing that we can do.
const words = text.split(" ");
for (let i = 0; i < words.length; i++) {
this.addQuotable(
words[i],
qchars,
i == words.length - 1 ? mayBreakAfter : true
);
}
};
// A regular expression for characters that need to be encoded.
var nonAsciiRe = /[^\x20-\x7e]/;
// The beginnings of RFC 2047 encoded-word
var b64Prelude = "=?UTF-8?B?";
var qpPrelude = "=?UTF-8?Q?";
// A list of ASCII characters forbidden in RFC 2047 encoded-words
var qpForbidden = "\"#$%&'(),.:;<=>?@[\\]^_`{|}~";
var hexString = "0123456789ABCDEF";
/**
* Add a block of text as a single RFC 2047 encoded word. This does not try to
* split words if they are too long.
*
* @param {Uint8Array} encodedText - The octets to encode.
* @param {boolean} useQP - If true, use quoted-printable; if false,
* use base64.
* @param {boolean} mayBreakAfter - If true, the end of this text is a
* preferred breakpoint.
*/
HeaderEmitter.prototype._addRFC2047Word = function (
encodedText,
useQP,
mayBreakAfter
) {
const binaryString = mimeutils.typedArrayToString(encodedText);
let token;
if (useQP) {
token = qpPrelude;
for (let i = 0; i < encodedText.length; i++) {
if (
encodedText[i] < 0x20 ||
encodedText[i] >= 0x7f ||
qpForbidden.includes(binaryString[i])
) {
const ch = encodedText[i];
token += "=" + hexString[(ch & 0xf0) >> 4] + hexString[ch & 0x0f];
} else if (binaryString[i] == " ") {
token += "_";
} else {
token += binaryString[i];
}
}
token += "?=";
} else {
token = b64Prelude + btoa(binaryString) + "?=";
}
this.addText(token, mayBreakAfter);
};
/**
* Add a block of text as potentially several RFC 2047 encoded-word tokens.
*
* @param {string} text - The text to add to the output.
* @param {boolean} mayBreakAfter - If true, the end of this text is a preferred
* breakpoint.
*/
HeaderEmitter.prototype.encodeRFC2047Phrase = function (text, mayBreakAfter) {
// Start by encoding the text into UTF-8 directly.
const encodedText = new TextEncoder("UTF-8").encode(text);
// Make sure there's enough room for a single token.
const minLineLen = b64Prelude.length + 10; // Eight base64 characters plus ?=
if (!this._reserveTokenSpace(minLineLen)) {
this._commitLine(this._currentLine.length);
}
// Try to encode as much UTF-8 text as possible in each go.
let b64Len = 0,
qpLen = 0,
start = 0;
let maxChars =
this._softMargin - this._currentLine.length - (b64Prelude.length + 2);
for (let i = 0; i < encodedText.length; i++) {
let b64Inc = 0,
qpInc = 0;
// The length we need for base64 is ceil(length / 3) * 4...
if ((i - start) % 3 == 0) {
b64Inc += 4;
}
// The length for quoted-printable is 3 chars if the octet must be escaped,
// and 1 char otherwise.
if (
encodedText[i] < 0x20 ||
encodedText[i] >= 0x7f ||
qpForbidden.includes(String.fromCharCode(encodedText[i]))
) {
qpInc = 3;
} else {
qpInc = 1;
}
if (b64Len + b64Inc > maxChars && qpLen + qpInc > maxChars) {
// We have too many characters, so we need to flush everything before the
// current octet as an encoded-word. However, we can't split in the middle of
// a multibyte character: in UTF-8, bytes of the form 10xxxxxx are
// continuation bytes, so backtrack until we are at the start of a character.
while ((encodedText[i] & 0xc0) == 0x80) {
--i;
}
// Add this part of the word and then make a continuation.
this._addRFC2047Word(
encodedText.subarray(start, i),
b64Len >= qpLen,
true
);
// Start the next chunk at this octet.
start = i;
--i; // Reparse this character as well
b64Len = qpLen = 0;
maxChars = this._softMargin - b64Prelude.length - 3;
} else {
// Add the counts for the current variable to the count to encode.
b64Len += b64Inc;
qpLen += qpInc;
}
}
// Add the entire array at this point.
this._addRFC2047Word(
encodedText.subarray(start),
b64Len >= qpLen,
mayBreakAfter
);
};
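// Note on the loop above: a phrase whose UTF-8 encoding does not fit in one
// encoded-word is split into several tokens, each broken only at UTF-8
// character boundaries; each chunk uses quoted-printable when that form is no
// longer than base64 (b64Len >= qpLen), and base64 otherwise.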
// High-level methods
// ------------------
/**
* Add the header name, with the colon and trailing space, to the output.
*
* @param {string} headerName - The name of the header.
*/
HeaderEmitter.prototype.addHeaderName = function (headerName) {
this._currentLine = this._currentLine.trimEnd();
if (this._currentLine.length > 0) {
this._commitLine();
}
this.addText(headerName + ": ", false);
};
/**
* Add a header and its structured value to the output.
*
* The name can be any case-insensitive variant of a known structured header;
* the output will include the preferred name of the structure instead of the
* case put into the name. If no structured encoder can be found, and the input
* value is a string, then the header is assumed to be unstructured and the
* value is added as if {@link addUnstructured} were called.
*
* @param {string} headerName - The name of the header.
* @param {string} value - The structured value of the header.
*/
HeaderEmitter.prototype.addStructuredHeader = function (headerName, value) {
const lowerName = headerName.toLowerCase();
if (encoders.has(lowerName)) {
this.addHeaderName(preferredSpellings.get(lowerName));
encoders.get(lowerName).call(this, value);
} else if (typeof value === "string") {
// Assume it's an unstructured header.
// All-lower-case-names are ugly, so capitalize first letters.
headerName = headerName.replace(/(^|-)[a-z]/g, function (match) {
return match.toUpperCase();
});
this.addHeaderName(headerName);
this.addUnstructured(value);
} else {
throw new Error("Unknown header: " + headerName);
}
};
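// Illustrative sketch, not part of the module; the outputs shown are
// assumptions based on the default soft margin:
//   emitter.addStructuredHeader("to", [{ name: "John Doe", email: "john@example.com" }]);
//     // assuming "To" has a registered addressing encoder, this emits roughly:
//     // To: John Doe <john@example.com>
//   emitter.addStructuredHeader("x-custom-note", "plain text value");
//     // no encoder and a string value, so it is emitted as unstructured:
//     // X-Custom-Note: plain text value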
/**
* Add a single address to the header. The address is an object consisting of a
* possibly-empty display name and an email address.
*
* @param {Address} addr - The address to be added.
* @param {string} addr.name - The (possibly-empty) name of the address to add.
* @param {string} addr.email - The email of the address to add.
* @see headerparser.parseAddressingHeader
*/
HeaderEmitter.prototype.addAddress = function (addr) {
// If we have a display name, add that first.
if (addr.name) {
// This is a simple estimate that keeps names on one line if possible.
this._reserveTokenSpace(addr.name.length + addr.email.length + 3);
this.addPhrase(addr.name, ',()<>[]:;@."', true);
// If we don't have an email address, don't write out the angle brackets for
// the address. An address without an email is already abnormal, and omitting
// the brackets has better round-tripping properties.
if (!addr.email) {
return;
}
this.addText("<", false);
}
// Find the local-part and domain of the address, since the local-part may
// need to be quoted separately. Note that the @ goes to the domain, so that
// the local-part may be quoted if it needs to be.
const at = addr.email.lastIndexOf("@");
let localpart = "",
domain = "";
if (at == -1) {
localpart = addr.email;
} else {
localpart = addr.email.slice(0, at);
domain = addr.email.slice(at);
}
this.addQuotable(localpart, '()<>[]:;@\\," !', false);
this.addText(domain + (addr.name ? ">" : ""), false);
};
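// Illustrative sketch, not part of the module; the outputs are assumptions:
//   { name: "John Doe", email: "john@example.com" } -> John Doe <john@example.com>
//   { name: "", email: "john doe@example.com" }     -> "john doe"@example.com
//     (the local-part contains a space, so it is emitted as a quoted-string)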
/**
* Add an array of addresses and groups to the output. Such an array may be
* found as the output of {@link headerparser.parseAddressingHeader}. Each
* element is either an address (an object with properties name and email), or a
* group (an object with properties name and group).
*
* @param {(Address|Group)[]} addresses - A collection of addresses to add.
* @param {string} addresses[].name - The (possibly-empty) name of the
* address or the group to add.
* @param {string} [addresses[].email] - The email of the address to add.
* @param {Address[]} [addresses[].group] - A list of email addresses in the group.
* @see HeaderEmitter.addAddress
* @see headerparser.parseAddressingHeader
*/
HeaderEmitter.prototype.addAddresses = function (addresses) {
let needsComma = false;
for (const addr of addresses) {
// Add a comma if this is not the first element.
if (needsComma) {
this.addText(", ", true);
}
needsComma = true;
if ("email" in addr) {
this.addAddress(addr);
} else {
// A group has format name: member, member;
// Note that we still add a comma after the group is completed.
this.addPhrase(addr.name, ',()<>[]:;@."', false);
this.addText(":", true);
this.addAddresses(addr.group);
this.addText(";", true);
}
}
};
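// Illustrative sketch, not part of the module; the output is an assumption:
//   emitter.addAddresses([
//     { name: "Alice", email: "alice@example.com" },
//     { name: "friends", group: [{ name: "", email: "bob@example.com" }] },
//   ]);
//     // emits roughly: Alice <alice@example.com>, friends: bob@example.com;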
/**
* Add an unstructured header value to the output. This effectively means only
* inserting line breaks where necessary, and using RFC 2047 encoding where
* needed.
*
* @param {string} text - The text to add to the output.
*/
HeaderEmitter.prototype.addUnstructured = function (text) {
if (text.length == 0) {
return;
}
// Unstructured text is basically a phrase that can't be quoted. So, if we
// have nothing in qchars, nothing should be quoted.
this.addPhrase(text, "", false);
};
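// Illustrative sketch, not part of the module: for a Subject-style value,
//   emitter.addUnstructured("Re: Meeting notes");
// emits the text as-is (folding long values at spaces); non-ASCII text is
// routed through RFC 2047 encoding by addPhrase.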
/** RFC 822 labels for days of the week. */
var kDaysOfWeek = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"];
/**
* Formatting helper to zero-pad single-digit numbers (0-9) to two digits
* (00-09).
*
* @param {number} num - The number to format.
* @returns {string} The number as a string, zero-padded to two digits.
*/
function padTo2Digits(num) {
return num < 10 ? "0" + num : num.toString();
}
/**
* Add a date/time field to the output, using the JS date object as the time
* representation. The value will be output using the timezone offset of the
* date object, which is usually the timezone of the user (modulo timezone and
* DST changes).
*
* Note that if the date is invalid (its internal time value is NaN), this
* method throws an error instead of generating an invalid string.
*
* @param {Date} date - The date to be added to the output string.
*/
HeaderEmitter.prototype.addDate = function (date) {
// Rather than make a header plastered with NaN values, throw an error on
// specific invalid dates.
if (isNaN(date.getTime())) {
throw new Error("Cannot encode an invalid date");
}
let fullYear,
month,
dayOfMonth,
dayOfWeek,
hours,
minutes,
seconds,
tzOffset;
if (this._sanitizeDate) {
fullYear = date.getUTCFullYear();
month = date.getUTCMonth();
dayOfMonth = date.getUTCDate();
dayOfWeek = date.getUTCDay();
hours = date.getUTCHours();
minutes = date.getUTCMinutes();
// To reduce the chance of fingerprinting the clock offset,
// round the time down to the nearest minute.
seconds = 0;
tzOffset = 0;
} else {
fullYear = date.getFullYear();
month = date.getMonth();
dayOfMonth = date.getDate();
dayOfWeek = date.getDay();
hours = date.getHours();
minutes = date.getMinutes();
seconds = date.getSeconds();
tzOffset = date.getTimezoneOffset();
}
// RFC 5322 says years can't be before 1900. The upper bound of 9999 derives
// from the specification requiring years to have four digits.
if (fullYear < 1900 || fullYear > 9999) {
throw new Error("Date year is out of encodable range");
}
// Start by computing the timezone offset string. There is no good built-in
// formatter for this, so the 0-padding is done by hand. Note that the offset
// we output is in the form ±hhmm, so we need to split the offset (in minutes)
// into an hour and minute pair. JS getTimezoneOffset() is positive west of
// UTC, hence the inverted sign below.
const tzOffHours = Math.abs(Math.trunc(tzOffset / 60));
const tzOffMinutes = Math.abs(tzOffset) % 60;
const tzOffsetStr =
(tzOffset > 0 ? "-" : "+") +
padTo2Digits(tzOffHours) +
padTo2Digits(tzOffMinutes);
// Convert the day-time figure into a single value to avoid unwanted line
// breaks in the middle.
const dayTime = [
kDaysOfWeek[dayOfWeek] + ",",
dayOfMonth,
mimeutils.MONTH_NAMES[month],
fullYear,
padTo2Digits(hours) +
":" +
padTo2Digits(minutes) +
":" +
padTo2Digits(seconds),
tzOffsetStr,
].join(" ");
this.addText(dayTime, false);
};
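// Illustrative sketch, not part of the module: for a date created with
// new Date("2020-06-15T13:45:07") in a UTC+02:00 locale, this would emit
//   Mon, 15 Jun 2020 13:45:07 +0200
// while with the _sanitizeDate option it would emit the UTC time with seconds
// zeroed and a +0000 offset instead.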
/**
* Signal that the current header has been finished encoding.
*
* @param {boolean} deliverEOF - If true, signal to the handler that no more
* text will be arriving.
*/
HeaderEmitter.prototype.finish = function (deliverEOF) {
this._commitLine();
if (deliverEOF) {
this._handler.deliverEOF();
}
};
/**
* Make a streaming header emitter that outputs on the given handler.
*
* @param {StreamHandler} handler - The handler that consumes the output.
* @param {object} options - Options for the HeaderEmitter constructor.
* @returns {HeaderEmitter} a header emitter constructed with the given options.
*/
function makeStreamingEmitter(handler, options) {
return new HeaderEmitter(handler, options);
}
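/**
* A minimal StreamHandler that accumulates all delivered data into a string.
* It is used by emitStructuredHeader and emitStructuredHeaders below.
*/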
function StringHandler() {
this.value = "";
this.deliverData = function (str) {
this.value += str;
};
this.deliverEOF = function () {};
}
/**
* Given a header name and its structured value, output a string containing its
* MIME-encoded value. The trailing CRLF for the header is included.
*
* @param {string} headerName - The name of the structured header.
* @param {string} value - The value of the structured header.
* @param {object} options - Options for the HeaderEmitter constructor.
* @returns {string} A MIME-encoded representation of the structured header.
* @see {HeaderEmitter.addStructuredHeader}
*/
function emitStructuredHeader(headerName, value, options) {
const handler = new StringHandler();
const emitter = new HeaderEmitter(handler, options);
emitter.addStructuredHeader(headerName, value);
emitter.finish(true);
return handler.value;
}
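// Illustrative usage sketch, not part of the module, assuming "To" has a
// registered addressing encoder and default options:
//   emitStructuredHeader(
//     "To",
//     [{ name: "John Doe", email: "john@example.com" }],
//     {}
//   );
//   // -> "To: John Doe <john@example.com>\r\n"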
/**
* Given a map of header names and their structured values, output a string
* containing all of their headers and their MIME-encoded values.
*
* This function is designed to emit header values from the headerData map
* produced by MIME parsing. Accordingly, each value of the map is an array of
* structured values, one per occurrence of the header.
*
* @param {Map<string,object[]>} headerValues - A map of header names to
* arrays of their structured values.
* @param {object} options - Options for the HeaderEmitter constructor.
* @returns {string} A MIME-encoded representation of the structured headers.
* @see {HeaderEmitter.addStructuredHeader}
*/
function emitStructuredHeaders(headerValues, options) {
const handler = new StringHandler();
const emitter = new HeaderEmitter(handler, options);
for (const instance of headerValues) {
instance[1].forEach(function (e) {
emitter.addStructuredHeader(instance[0], e);
});
}
emitter.finish(true);
return handler.value;
}
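// Illustrative usage sketch, not part of the module; the header names and
// output shown are assumptions:
//   const headerValues = new Map([
//     ["Subject", ["Hello"]],
//     ["To", [[{ name: "", email: "john@example.com" }]]],
//   ]);
//   emitStructuredHeaders(headerValues, {});
//   // -> "Subject: Hello\r\nTo: john@example.com\r\n"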
/**
* Add a custom structured MIME encoder to the set of known encoders. These
* encoders are used for {@link emitStructuredHeader} and similar functions to
* encode richer, more structured values instead of relying on string
* representations everywhere.
*
* Structured encoders are functions that take a single parameter representing
* the structured value. They are called with `this` set to an instance of
* {@link HeaderEmitter}, whose public and protected methods are intended to be
* used for encoding the value.
*
* A large set of structured encoders is already built into the jsmime library.
*
* @param {string} header - The header name (in its preferred case) for
* which the encoder will be used.
* @param {Function} encoder - The structured encoder function.
* Takes the structured value as input.
*/
function addStructuredEncoder(header, encoder) {
const lowerName = header.toLowerCase();
encoders.set(lowerName, encoder);
if (!preferredSpellings.has(lowerName)) {
preferredSpellings.set(lowerName, header);
}
}
return Object.freeze({
addStructuredEncoder,
emitStructuredHeader,
emitStructuredHeaders,
makeStreamingEmitter,
});
};
const headeremitter = JsMIMEheaderemitter();
// "Extra" structured MIME headers.
function parseNewsgroups(headers) {
let ng = [];
for (const header of headers) {
ng = ng.concat(header.split(/\s*,\s*/));
}
return ng;
}
function emitNewsgroups(groups) {
// Don't RFC 2047-encode newsgroup names; emit them comma-separated as-is.
this.addText(groups[0], false);
for (let i = 1; i < groups.length; i++) {
this.addText(",", false); // only comma, no space!
this.addText(groups[i], false);
}
}
headerparser.addStructuredDecoder("Newsgroups", parseNewsgroups);
headerparser.addStructuredDecoder("Followup-To", parseNewsgroups);
headeremitter.addStructuredEncoder("Newsgroups", emitNewsgroups);
headeremitter.addStructuredEncoder("Followup-To", emitNewsgroups);
export const jsmime = {
mimeutils,
MimeParser,
headerparser,
headeremitter,
};