Source code

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "MboxMsgInputStream.h"
#include "nsString.h"
#include "nsMsgUtils.h"
#include "nsTArray.h"
#include "mozilla/Logging.h"
#include "LineReader.h" // For FirstLine().
#include <algorithm>
#include <functional>
#include <limits>
extern mozilla::LazyLogModule gMboxLog;
using mozilla::LogLevel;
/**
* MboxParser is a helper class to manage parsing messages out of an mbox
* byte stream.
* Call Feed() to pass data into the parser, in chunks of at least MinChunk
* size. Pass in a chunk less than MinChunk size to indicate EOF of the mbox
* file (a zero-size chunk is fine). The resulting message is written to a
* growable output buffer and accessed via Drain(). Use Available() to see how
* many bytes are ready to drain. When a complete message has been parsed,
* parsing will halt, and further calls to Feed() will consume no more data.
* However, the message is not considered 'finished' until it has been
* completely read from the output buffer (via Drain()). At this point,
* IsFinished() will return true. To continue with the next message, you can
* then call Kick() to resume the parsing. AtEOF() will return true when all
* messages have been parsed (and drained).
*
* Goals:
* - Assume well formed mboxrd format, but try to handle other variants
* and don't freak out when malformed data is encountered.
* - Don't choke on invalid messages. Just treat mbox as a container format
* and aim to return messages as accurately as possible, even if malformed.
* - Avoid copying and memory reallocation as much as possible.
* - Cope with pathological cases without buffering up huge quantities of data.
* eg "From " followed by gigabytes of non-EOL characters.
* Output buffer size is kept down to roughly what you pass in with a
* single call to Feed().
*
* Note:
* It'd be nice to ditch the output buffer and the extra copy involved, but
* that'd require the caller passing in an output buffer, and the parser would
* have to break off parsing when that buffer is full. It could be done, but
* the extra complexity probably isn't worth it...
*/
class MboxParser {
public:
using span = mozilla::Span<const char>;
static constexpr size_t InitialOutBufSize = 8192;
MboxParser() : mOutBuffer(InitialOutBufSize), mCursor(0) {}
/**
* Returns the number of chars available for reading by Drain().
*/
size_t Available() const { return mOutBuffer.Length() - mCursor; }
/**
* Returns true when a complete message has been parsed and read out
* via Drain().
*/
bool IsFinished() const {
return Available() == 0 && (mState == eEOF || mState == eMessageComplete);
}
/**
* Returns true when the end of the mbox has been reached (and the last
* message has been completely read out via Drain()).
*/
bool AtEOF() const { return Available() == 0 && mState == eEOF; }
/**
* MinChunk is the minimum amount of data callers should pass into Feed().
* If less than MinChunk is passed in, Feed() knows that there will be no
* more data to come (i.e. EOF).
* It is chosen to be a reasonable minimum for our end-of-message
* heuristic (A "From " line followed by a couple of likely-looking header
* lines).
* Note: This is just a guideline minimum value for callers. In practice,
* sensible callers would aim to feed in chunks much larger than this.
*/
static constexpr size_t MinChunk = 128;
/**
* Feed a chunk of data into the parser for processing.
* Returns any portion of the data which was unused.
* Expects at least MinChunk bytes. Passing in less than MinChunk bytes
* indicates that EOF is on the horizon - the end of the mbox file.
*
* Calling Feed() is no guarantee that output will be ready via
* Available()/Drain(). For example, "From "-separator lines produce no
* output.
*
* It is an error to call Feed() if data is available to Drain().
*
* If a complete message has been parsed, Feed() will consume no further
* data until the message output has been drained and the parsing is
* restarted via Kick().
*/
span Feed(span data) {
MOZ_LOG(gMboxLog, LogLevel::Verbose,
("MboxParser - Feed() %zu bytes: '%s')", data.Length(),
CEscapeString(nsDependentCSubstring(data), 80).get()));
MOZ_ASSERT(Available() == 0);
// Is this the end of the mbox?
bool endOfMbox = data.Length() < MinChunk;
// Loop until we've used up all the data we can.
while (true) {
// If a message is complete (or the mbox is finished), then
// we stall.
if (mState == eMessageComplete || mState == eEOF) {
break;
}
// Have the current state use up as much data as it needs.
data = handle(data);
if (data.Length() < MinChunk) {
if (data.IsEmpty()) {
break;
}
if (!endOfMbox) {
// We know there's more data to come, so go away and come back
// when theres >=MinChunk.
break;
}
}
}
return data;
}
/**
* Drain() reads processed data out of parser into buf.
* It'll produce a maximum of `count` bytes.
* The number of bytes actually read is returned.
*/
size_t Drain(char* buf, size_t count) {
size_t n = std::min(count, Available());
auto start = mOutBuffer.cbegin() + mCursor;
std::copy(start, start + n, buf);
mCursor += n;
MOZ_ASSERT(mCursor <= mOutBuffer.Length());
// If only a small proportion (<25%) has been left unconsumed, move it to
// the beginning of the buffer. Ideally, the caller would drain all
// available data in one go, but that's not always possible.
if (Available() < (mOutBuffer.Length() / 4)) {
mOutBuffer.RemoveElementsAt(0, mCursor);
mCursor = 0;
}
return n;
}
/**
* When a message has been completely parsed and drained,
* Kick() can be called to resume parsing for the next message (if any).
*/
void Kick() {
MOZ_ASSERT(IsFinished());
if (mState == eMessageComplete) {
mState = eExpectFromLine;
}
}
private:
// Processed data is stored here, ready to be read out by Drain().
nsTArray<char> mOutBuffer;
// Start of unread data within mOutBuffer.
size_t mCursor{0};
// Number of '>' characters at start of line, for eCountQuoting state.
int mQuoteCnt{0};
// Our states. In general, the Expect* states don't consume any data -
// they just sniff data and move to a new state accordingly.
enum {
eExpectFromLine = 0, // We start here.
eDiscardFromLine,
eExpectHeaderLine,
eEmitHeaderLine,
eEmitSeparator, // Blank line between header and body.
eExpectBodyLine,
eCountQuoting, // Line starts with one or more '>' chars.
eEmitQuoting,
eEmitBodyLine,
eMessageComplete, // Message is complete (or ended prematurely).
eEOF, // End of mbox.
} mState{eExpectFromLine};
// handle_<state>() functions consume as much data as they need, and
// return whatever is left over.
// If they are given <MinChunk bytes, they are free to assume the end
// the mbox file has been reached.
span handle(span data) {
{
const char* stateName[] = {"eExpectFromLine",
"eDiscardFromLine",
"eExpectHeaderLine",
"eEmitHeaderLine",
"eEmitSeparator",
"eExpectBodyLine",
"eCountQuoting",
"eEmitQuoting",
"eEmitBodyLine",
"eMessageComplete",
"eEOF"};
MOZ_LOG(gMboxLog, LogLevel::Verbose,
("MboxParser - handle %s (%zu bytes: '%s')", stateName[mState],
data.Length(),
CEscapeString(nsDependentCSubstring(data), 80).get()));
}
switch (mState) {
case eExpectFromLine:
return handle_eExpectFromLine(data);
case eDiscardFromLine:
return handle_eDiscardFromLine(data);
case eExpectHeaderLine:
return handle_eExpectHeaderLine(data);
case eEmitHeaderLine:
return handle_eEmitHeaderLine(data);
case eEmitSeparator:
return handle_eEmitSeparator(data);
case eExpectBodyLine:
return handle_eExpectBodyLine(data);
case eCountQuoting:
return handle_eCountQuoting(data);
case eEmitQuoting:
return handle_eEmitQuoting(data);
case eEmitBodyLine:
return handle_eEmitBodyLine(data);
case eMessageComplete:
return handle_eMessageComplete(data);
case eEOF:
return handle_eEOF(data);
default:
MOZ_ASSERT_UNREACHABLE(); // should not happen
}
}
// We're expecting a new message to start, or an EOF.
span handle_eExpectFromLine(span data) {
if (data.Length() < 5) { // Enough to check for "From "?
mState = eEOF; // no more messages.
return span(); // discard data
}
if (IsFromLine(data)) {
mState = eDiscardFromLine;
} else {
MOZ_LOG(gMboxLog, LogLevel::Warning,
("MboxParser - Missing 'From ' separator"));
// Just jump straight to header phase.
mState = eExpectHeaderLine;
}
return data;
}
// Ditch the "From " line.
// (Pathological case: "From " followed by gigabyte-length line).
span handle_eDiscardFromLine(span data) {
if (data.IsEmpty()) {
return PrematureEOF(data);
}
bool hitEOL;
data = DiscardUntilEOL(data, hitEOL);
if (hitEOL) {
mState = eExpectHeaderLine;
}
return data;
}
// Decide if we're still in the header block.
// We don't need to worry about folded lines. Any non-blank line is just
// treated as a header line and output verbatim.
span handle_eExpectHeaderLine(span data) {
if (data.Length() < 2) {
return PrematureEOF(data);
}
// Start with an EOL? (CRLF or LF)
size_t eolSize = SniffEOLAtStart(data);
if (eolSize > 0) {
mState = eEmitSeparator; // Yes. Line is blank.
} else {
mState = eEmitHeaderLine;
}
return data;
}
// Output a single header line.
span handle_eEmitHeaderLine(span data) {
if (data.IsEmpty()) {
return PrematureEOF(data);
}
bool hitEOL;
data = EmitUntilEOL(data, hitEOL);
if (hitEOL) {
mState = eExpectHeaderLine;
}
return data;
}
// We're emitting the blank line separating header and body.
span handle_eEmitSeparator(span data) {
if (data.IsEmpty()) {
return PrematureEOF(data);
}
bool hitEOL;
data = EmitUntilEOL(data, hitEOL);
// We wouldn't be here unless an EOL was found (see eExpectHeaderLine).
MOZ_ASSERT(hitEOL == true);
mState = eExpectBodyLine;
return data;
}
// Decide if we're still in body or if end of message has been hit.
// While there _should_ be a blank line after each message, before the
// "From " separator... we can't rely on that.
// If there is a blank line at the end of the message it should be stripped.
span handle_eExpectBodyLine(span data) {
if (data.IsEmpty()) {
// Actual EOF, so we're done (it'll advance to eEOF once the message
// is drained and we go to look for the next one).
mState = eMessageComplete;
return data;
}
// Need to unescape lines beginning ">From " (or ">>>>From " etc).
// (Pathological case: so many leading '>' chars that we don't see
// anything else in the buffer. So use a separate state to count them).
if (data[0] == '>') {
mQuoteCnt = 0;
mState = eCountQuoting;
return data;
}
// Check for blank line.
size_t n = SniffEOLAtStart(data);
if (n == data.Length()) {
// EOF. Suppress last blank line.
mState = eMessageComplete;
return data.From(n);
}
// Is it an unescaped "From " (optionally with a preceding blank line)?
// A line beginning with "From " is end of message according to spec,
// But we want to be really really sure, so we can support some cases
// where it's just a badly-encoded message body.
if (IsReallyReallyFromLine(data.From(n))) {
mState = eMessageComplete;
// If there was a preceding blank line, suppress it.
return data.From(n);
}
// Just output the line as it is.
mState = eEmitBodyLine;
return data;
}
span handle_eEmitBodyLine(span data) {
if (data.IsEmpty()) {
return PrematureEOF(data);
}
bool hitEOL;
data = EmitUntilEOL(data, hitEOL);
if (hitEOL) {
mState = eExpectBodyLine;
}
return data;
}
// Soak up and count '>' quote chars.
// (pathological case: line starting with gigabytes of repeated '>' char)
span handle_eCountQuoting(span data) {
if (data.IsEmpty()) {
// Uhoh, EOF. Write out the chars we held back, then bail out.
while (mQuoteCnt > 0) {
--mQuoteCnt;
Emit(span(">", 1));
}
return PrematureEOF(data);
}
auto is_quote = [](char c) { return c == '>'; };
auto firstNonQuote = std::find_if_not(data.cbegin(), data.cend(), is_quote);
auto n = firstNonQuote - data.cbegin();
mQuoteCnt += n;
if (firstNonQuote != data.cend()) {
// We hit the end of the quotes.
mState = eEmitQuoting;
}
return span(firstNonQuote, data.cend());
}
// Spit out appropriate quoting for upcoming body line.
span handle_eEmitQuoting(span data) {
if (data.IsEmpty()) {
// Uhoh. Write out the chars we held back, then bail out.
while (mQuoteCnt > 0) {
--mQuoteCnt;
Emit(span(">", 1));
}
return PrematureEOF(data);
}
// Body line continues with "From "?
if (IsFromLine(data)) {
// Yes! We need to remove a '>' to unescape it.
MOZ_ASSERT(mQuoteCnt > 0);
--mQuoteCnt;
}
// Write out the '>' chars we held back.
while (mQuoteCnt > 0) {
--mQuoteCnt;
Emit(span(">", 1));
}
// Output the rest of the line as a normal body line.
mState = eEmitBodyLine;
return data;
}
// All done, so this is a no-op.
span handle_eMessageComplete(span data) {
if (data.IsEmpty()) {
mState = eEOF;
} else {
mState = eExpectFromLine;
}
return data;
}
// All done, so this is a no-op.
span handle_eEOF(span data) {
MOZ_ASSERT(data.IsEmpty());
return data;
}
// Helper for when we hit unexpected EOF.
// Log it, output remaining data, and go into eMessageComplete state.
span PrematureEOF(span data) {
MOZ_LOG(gMboxLog, LogLevel::Warning, ("MboxParser - PrematureEOF"));
// We don't go directly to eEOF.
// Going to eMessageComplete holds parsing up until the output
// has all been drained.
// After this, eExpectFromLine will move us into eEOF.
mState = eMessageComplete;
Emit(data);
return data.Last<0>();
}
// Discard all data up to (and including) an EOL.
// hitEOL is set if the end of the line is encountered.
span DiscardUntilEOL(span data, bool& hitEOL) {
hitEOL = false;
auto eol = std::find(data.cbegin(), data.cend(), '\n');
if (eol != data.cend()) {
hitEOL = true;
++eol; // Include '\n' in discard.
}
auto n = eol - data.cbegin();
return data.From(n);
}
// Emit all data up to (and including) an EOL.
// hitEOL is set if the end of the line is encountered.
span EmitUntilEOL(span data, bool& hitEOL) {
hitEOL = false;
auto eol = std::find(data.cbegin(), data.cend(), '\n');
if (eol != data.cend()) {
hitEOL = true;
++eol; // Include '\n'.
}
auto n = eol - data.cbegin();
Emit(data.First(n));
return data.From(n);
}
// Emit a chunk of data, to be picked up by Drain().
void Emit(span data) { mOutBuffer.AppendElements(data); }
// Check for "From " at start of data.
static bool IsFromLine(span data) {
if (data.Length() < 5) {
return false;
}
nsDependentCSubstring cookie(data.First<5>());
return cookie.EqualsLiteral("From ");
}
// Check for an EOL sequence at the start of data.
// Returns size of EOL sequence found: 0=none, 1=LF, 2=CRLF.
static size_t SniffEOLAtStart(span data) {
if (data.Length() >= 1 && data[0] == '\n') {
return 1;
}
if (data.Length() >= 2 && data[0] == '\r' && data[1] == '\n') {
return 2;
}
return 0;
}
// A more rigorous "From " check which tries to detect spurious cases
// where a message body hasn't been properly escaped.
// The heuristic we use:
// If the "From " line is followed by two lines which look like headers,
// treat it as a message separator.
// Otherwise assume it's part of the message body.
// NOTE: the incoming data might not include two complete headers.
// If that's the case and we don't spot anything that's obviously _not_
// a header, then we'll give it the benefit of the doubt.
// It would be more rigorous to implement this as its own state, so
// it's not restricted to the size of the buffer passed in (>=MinChunk).
// But that would require buffering up the data that passed through, in
// order to roll back once the decision has been made.
// And it's just not worth the extra complexity for such an obscure case.
// Rationale:
// 1) We're just trying to catch malformed mboxes already in the wild. We
// shouldn't have this problem if the mbox was written out properly
// (i.e. written by Thunderbird!).
// 2) We should be using big buffers, and the likelihood of hitting a
// malformed message at a read boundary is teeny tiny.
// So it's not worth jumping through toooooo many hoops here.
static bool IsReallyReallyFromLine(span data) {
if (!IsFromLine(data)) {
return false;
}
auto it = data.cbegin();
auto end = data.cend();
// Skip past the "From " line
it = std::find(it, end, '\n');
if (it == end) {
// "From " line takes up entirety of buffer.
// Done all we can, so allow benefit of the doubt.
return true;
}
++it;
// Now apply our heuristic by sniffing for mail headers.
// From RFC 5322:
// ```
// Header fields are lines beginning with a field name, followed by a
// colon (":"), followed by a field body, and terminated by CRLF. A
// field name MUST be composed of printable US-ASCII characters (i.e.,
// characters that have values between 33 and 126, inclusive), except
// colon.
// ```
auto is_fieldnamechar = [](char c) -> bool {
// return true if char is valid for a mail header name.
return c != ':' && c >= 33 && c <= 126;
};
// Check that the "From " line is followed by mail headers (2 is enough).
// If we run out of data without seeing anything that's obviously not a
// header, give it the benefit of the doubt.
for (int headercount = 0; headercount < 2; ++headercount) {
it = std::find_if_not(it, end, is_fieldnamechar);
if (it == end) {
return true;
}
if (*it != ':') {
// Line is not a valid header.
MOZ_LOG(gMboxLog, LogLevel::Warning,
("MboxParser - detected unescaped \"From \" line (data='%s')",
CEscapeString(nsDependentCSubstring(data), 80).get()));
return false;
}
++it;
// Next line.
it = std::find(it, end, '\n');
if (it == end) {
return true;
}
++it;
if (it == end) {
return true;
}
// Skip over any continued lines (folded headers).
while (*it == ' ' || *it == '\t') {
it = std::find(it, end, '\n');
if (it == end) {
return true;
}
++it;
if (it == end) {
return true;
}
}
}
return true; // That'll do nicely.
}
};
/**
* MboxMsgInputStream implementation.
*/
NS_IMPL_ISUPPORTS(MboxMsgInputStream, nsIInputStream);
MboxMsgInputStream::MboxMsgInputStream(nsIInputStream* mboxStream)
: mRawStream(mboxStream),
mStatus(NS_OK),
mBuf(8192),
mUsed(0),
mUnused(0),
mTotalUsed(0),
mMsgOffset(0),
mParser(new MboxParser()) {}
MboxMsgInputStream::~MboxMsgInputStream() { Close(); }
NS_IMETHODIMP MboxMsgInputStream::Close() {
mRawStream->Close();
mStatus = NS_BASE_STREAM_CLOSED;
return NS_OK;
}
bool MboxMsgInputStream::IsNullMessage() {
return mParser->IsFinished() && (mMsgOffset == mTotalUsed);
}
nsresult MboxMsgInputStream::Continue(bool& more) {
more = false;
// Can't continue if the stream was closed.
if (mStatus == NS_BASE_STREAM_CLOSED) {
return NS_BASE_STREAM_CLOSED;
}
MOZ_ASSERT(NS_SUCCEEDED(mStatus));
MOZ_ASSERT(mParser->IsFinished());
// Record start of the next message (or EOF).
mMsgOffset = mTotalUsed;
// Tell the parser to start on the next message
mParser->Kick();
mStatus = PumpData();
if (NS_FAILED(mStatus)) {
return mStatus;
}
if (mParser->AtEOF()) {
// No more messages.
return NS_OK;
}
more = true;
return NS_OK;
}
// Throw NS_BASE_STREAM_CLOSED if closed.
// Return 0 if EOF but not closed.
// Else return available bytes.
NS_IMETHODIMP MboxMsgInputStream::Available(uint64_t* result) {
*result = 0;
if (NS_FAILED(mStatus)) {
return mStatus;
}
mStatus = PumpData();
*result = static_cast<uint64_t>(mParser->Available());
return mStatus;
}
NS_IMETHODIMP MboxMsgInputStream::StreamStatus() { return mStatus; }
// Returns a count of 0 if EOF or closed.
// Never throws NS_BASE_STREAM_CLOSED
NS_IMETHODIMP MboxMsgInputStream::Read(char* buf, uint32_t count,
uint32_t* result) {
*result = 0;
if (mStatus == NS_BASE_STREAM_CLOSED) {
return NS_OK;
}
if (NS_FAILED(mStatus)) {
return mStatus;
}
// We just keep feeding data into the parser and copying out its output.
while (count > 0) {
mStatus = PumpData();
if (NS_FAILED(mStatus)) {
return mStatus;
}
size_t n = mParser->Drain(buf, (size_t)count);
if (n == 0) {
break; // Nothing more in this message. Return EOF.
}
MOZ_ASSERT(n <= UINT32_MAX);
buf += n;
count -= (uint32_t)n;
*result += n;
}
return NS_OK;
}
// Helper fn to feed data into the parser until there's something to drain,
// or until the end of the message has been hit.
// After calling this, mParser->Available() will only return 0 if the
// message is complete.
//
// Our read buffer (mBuf) is a fixed-size allocation, and breaks down like
// this:
// +---------------+---------------+----------------------+
// | used data | unused data | free space |
// +---------------+---------------+----------------------+
// ^ ^ ^ ^
// |<-- mUsed -->|<-- mUnused -->| mBuf.Length()
//
// Used data has been parsed already and can be ditched.
// Unused data has been read in, but not parsed yet.
// Free space is space we can read more raw data into.
//
// Obviously, the aim is to fill the buffer with each read,
// then exhaust it completely before reading more. But sometimes
// the parser will require more data before it can continue, so
// we have to "garbage collect" by moving the unused data to
// the front of the buffer to maximise the free space for reading.
// Luckily such parser stalls tend to involve small quantities of
// data (e.g. a "From " line falling between read boundaries).
nsresult MboxMsgInputStream::PumpData() {
// Feed data to the parser until there's data available to output (or until
// message is completed).
while (mParser->Available() == 0 && !mParser->IsFinished()) {
while (mUnused < MboxParser::MinChunk) {
if (mUsed > 0) {
// Shift the unused portion to the front of the buffer.
auto unused = mBuf.AsSpan().Subspan(mUsed, mUnused);
std::copy(unused.cbegin(), unused.cend(), mBuf.begin());
mUsed = 0;
}
uint32_t got;
size_t want = mBuf.Length() - (mUsed + mUnused);
nsresult rv = mRawStream->Read(mBuf.Elements() + mUnused, want, &got);
if (NS_FAILED(rv)) {
return rv;
}
if (got == 0) {
break; // EOF.
}
mUnused += got;
}
// Feed what we've got into the parser.
// If it's <MinChunk, then we've hit EOF, and the parser will handle it.
auto data = mBuf.AsSpan().Subspan(mUsed, mUnused);
data = mParser->Feed(data);
size_t consumed = mUnused - data.Length();
mTotalUsed += consumed;
mUsed += consumed;
mUnused -= consumed;
}
return NS_OK;
}
NS_IMETHODIMP MboxMsgInputStream::ReadSegments(nsWriteSegmentFun writer,
void* closure, uint32_t count,
uint32_t* _retval) {
return NS_ERROR_NOT_IMPLEMENTED;
}
NS_IMETHODIMP MboxMsgInputStream::IsNonBlocking(bool* nonBlocking) {
*nonBlocking = false;
return NS_OK;
}