nsMsgBodyHandler.cpp

comm-central/mailnews/search/src/nsMsgBodyHandler.cpp

Enable keyboard shortcuts

Revision control

Copy as Markdown

Other Tools

HG Web

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "msgCore.h"

#include "nsMsgSearchCore.h"

#include "nsMsgUtils.h"

#include "nsMsgBodyHandler.h"

#include "nsMsgSearchTerm.h"

#include "nsIInputStream.h"

#include "plbase64.h"

#include "nsMimeTypes.h"

/**
 * Constructs a body handler that reads a message through the search scope's
 * input stream. This overload never operates in filtering mode.
 *
 * @param scope  search scope used to open the message's input stream
 * @param msg    header of the message whose body is to be read
 */
nsMsgBodyHandler::nsMsgBodyHandler(nsIMsgSearchScopeTerm* scope,
                                   nsIMsgDBHdr* msg) {
  m_scope = scope;

  // Filtering state is settled before Initialize() runs. There is no
  // in-memory header list in this mode, so the filter fields stay empty.
  m_Filtering = false;
  m_headersSize = 0;
  m_headers = nullptr;

  Initialize();  // shared per-part parsing state

  // If the stream cannot be opened we only warn and bail out; m_msgStream then
  // stays unset and GetNextLine() reports EOF immediately.
  nsresult rv = m_scope->GetInputStream(msg, getter_AddRefs(m_msgStream));
  NS_ENSURE_SUCCESS_VOID(rv);

  m_lineBuffer = mozilla::MakeUnique<nsLineBuffer<char>>();
}

/**
 * Constructs a body handler that either walks an in-memory header list
 * (filtering mode) or reads the message through the search scope's input
 * stream.
 *
 * @param scope        search scope used to open the message's input stream
 * @param msg          header of the message whose body is to be read
 * @param headers      header list to walk; only used when Filtering is true
 * @param headersSize  size in bytes of the header list
 * @param Filtering    true to read from `headers`, false to read the stream
 */
nsMsgBodyHandler::nsMsgBodyHandler(nsIMsgSearchScopeTerm* scope,
                                   nsIMsgDBHdr* msg, const char* headers,
                                   uint32_t headersSize, bool Filtering) {
  m_scope = scope;
  m_Filtering = Filtering;
  m_headers = nullptr;
  m_headersSize = 0;

  Initialize();  // shared per-part parsing state

  if (!m_Filtering) {
    // Stream mode: on failure we only warn and return, leaving m_msgStream
    // unset so that GetNextLine() reports EOF immediately.
    nsresult rv = m_scope->GetInputStream(msg, getter_AddRefs(m_msgStream));
    NS_ENSURE_SUCCESS_VOID(rv);
    m_lineBuffer = mozilla::MakeUnique<nsLineBuffer<char>>();
    return;
  }

  // Filtering mode: lines come from the caller-supplied header list.
  m_headers = headers;
  m_headersSize = headersSize;
}

void nsMsgBodyHandler::Initialize()

// common initialization code regardless of what body type we are handling...

  // Default transformations for local message search and MAPI access

  m_stripHeaders = true;

  m_partIsHtml = false;

  m_base64part = false;

  m_partIsQP = false;

  m_isMultipart = false;

  m_partIsText = true;  // Default is text/plain, maybe proven otherwise later.

  m_pastPartHeaders = false;

  m_inMessageAttachment = false;

  m_headerBytesRead = 0;

// Nothing to release by hand: the destructor body is empty and members clean
// up after themselves.
nsMsgBodyHandler::~nsMsgBodyHandler() = default;

/**
 * Produces the next line of searchable text.
 *
 * Raw lines are pulled from the filter header list or the message stream and
 * run through ApplyTransformations() until one is not "eaten" or the input
 * ends.
 *
 * @param buf      (inout) receives the transformed text; also used as the
 *                         accumulator while base64/HTML content is collected
 * @param charset  (out)   set to the current part charset on success
 * @return         length of the text in buf, or -1 at end of input
 */
int32_t nsMsgBodyHandler::GetNextLine(nsCString& buf, nsCString& charset) {
  if (!m_Filtering && !m_msgStream) {
    // Neither a header list nor a stream to read from: treat as EOF.
    return -1;
  }

  int32_t resultLength = -1;  // stays -1 if the input ends before any output
  bool skipLine = true;
  do {
    nsAutoCString rawLine;
    const int32_t rawLength = m_Filtering ? GetNextFilterLine(rawLine)
                                          : GetNextLocalLine(rawLine);
    if (rawLength < 0) {
      break;  // input exhausted
    }
    resultLength = ApplyTransformations(rawLine, rawLength, skipLine, buf);
  } while (skipLine);

  if (resultLength < 0) {
    return -1;  // nothing to hand out
  }

  // In a non-multipart message the whole body (minus headers) is one encoded
  // blob, so it is decoded here rather than at a part boundary.
  if (!m_isMultipart && m_base64part) {
    Base64Decode(buf);
    m_base64part = false;
    resultLength = buf.Length();
  }

  // Likewise, HTML accumulated from a non-multipart message is stripped here.
  if (!m_isMultipart && m_partIsHtml) {
    StripHtml(buf);
    resultLength = buf.Length();
  }

  charset = m_partCharset;
  return resultLength;
}

/**
 * Returns the next header string from the filter header list.
 *
 * m_headers points at the unread remainder of a list of NUL-delimited header
 * strings and m_headersSize is the number of bytes left in it. Both are
 * advanced past whatever is consumed.
 *
 * The scan for the terminating NUL is bounded by m_headersSize, so a final
 * entry that lacks its terminator is returned as-is instead of being read
 * past the end of the list.
 *
 * @param buf  (out) receives the header text; truncated when the list is
 *                   already empty on entry
 * @return     number of bytes consumed from the list (the text plus its
 *             terminating NUL when present), or -1 when no header is left
 */
int32_t nsMsgBodyHandler::GetNextFilterLine(nsCString& buf) {
  if (!m_headers || m_headersSize == 0) {
    buf.Truncate();
    return -1;
  }

  // #mscott. Ugly hack! Filter header lists can carry CRs & LFs between the
  // NUL-delimited strings (e.g. "To" NUL CR LF "From"). Skip those, along with
  // spaces and stray NULs, so they are not mistaken for the start of a header.
  while (m_headersSize > 0 && (m_headers[0] == '\r' || m_headers[0] == '\n' ||
                               m_headers[0] == ' ' || m_headers[0] == '\0')) {
    m_headers++;
    m_headersSize--;
  }

  if (m_headersSize == 0) {
    // Only separators were left. buf is deliberately left untouched here.
    return -1;
  }

  // Look for the terminator inside the remaining bytes only.
  const void* terminator = memchr(m_headers, '\0', m_headersSize);
  const uint32_t textLength =
      terminator ? static_cast<uint32_t>(
                       static_cast<const char*>(terminator) - m_headers)
                 : m_headersSize;
  buf.Assign(m_headers, textLength);

  // Consume the text and, if there was one, its terminator. By construction
  // this never exceeds m_headersSize, so the unsigned counter cannot wrap.
  const uint32_t consumed = terminator ? textLength + 1 : textLength;
  m_headers += consumed;
  m_headersSize -= consumed;

  return static_cast<int32_t>(consumed);
}

/**
 * Reads the next line from the message stream via NS_ReadLine().
 *
 * @param line  (out) receives the line, with the EOL sequence trimmed off
 * @return      length of the line, or -1 on read failure or end of data
 */
int32_t nsMsgBodyHandler::GetNextLocalLine(nsACString& line) {
  bool more;
  nsresult rv = NS_ReadLine(m_msgStream.get(), m_lineBuffer.get(), line, &more);

  // `more` is only consulted when the read succeeded. An empty line with
  // nothing left to read means the data is exhausted.
  const bool exhausted = NS_FAILED(rv) || (line.IsEmpty() && !more);
  if (exhausted) {
    return -1;
  }
  return line.Length();
}

/**
 * Runs one raw input line through the body-handling state machine.
 *
 * In order, the line is:
 * * treated as a header while m_pastPartHeaders is false: it is sniffed for
 *   MIME information (SniffPossibleMIMEHeader) and eaten if m_stripHeaders
 *   is set; a blank line ends the header section;
 * * checked against the known multipart boundaries; on a match the content
 *   accumulated in buf is base64-decoded and/or HTML-stripped as needed and
 *   the per-part state (m_pastPartHeaders, m_base64part, m_partIsHtml,
 *   m_partIsText) is reset;
 * * dropped if the current part is not text;
 * * appended to buf (and eaten) if the part is base64 or HTML, so that it
 *   can be decoded or stripped later as a whole;
 * * otherwise copied into buf unchanged.
 *
 * @param line        (in)    the current line
 * @param length      (in)    the length of said line
 * @param eatThisLine (out)   whether the caller should ignore this line and
 *                            fetch another one
 * @param buf         (inout) accumulator for base64/HTML parts; otherwise an
 *                            out param receiving a copy of line
 * @return            length for the caller: `length` for header lines, 0 for
 *                    non-text parts, otherwise the length of buf
 */
int32_t nsMsgBodyHandler::ApplyTransformations(const nsCString& line,
                                               int32_t length,
                                               bool& eatThisLine,
                                               nsCString& buf) {
  eatThisLine = false;

  // --- Still inside the part headers -------------------------------------
  if (!m_pastPartHeaders) {
    if (m_stripHeaders) {
      eatThisLine = true;
    }

    // The sniffer extracts everything worthwhile from the header line, so
    // there is no need to keep earlier header lines around in buf.
    buf.Assign(line);
    SniffPossibleMIMEHeader(buf);

    const bool blankLine =
        buf.IsEmpty() || buf.First() == '\r' || buf.First() == '\n';
    if (blankLine) {
      if (m_inMessageAttachment) {
        // We have just read past the part header of an attached message.
        // Its own message headers and part headers follow, so we stay in
        // header mode and drop the special attachment handling.
        m_inMessageAttachment = false;
      } else {
        m_pastPartHeaders = true;
      }
    }

    return length;
  }

  // --- Does this line start with one of our boundary strings? ------------
  bool hitBoundary = false;
  if (m_isMultipart) {
    // Search from the most recently collected boundary back to the first.
    for (size_t count = m_boundaries.Length(); count > 0; --count) {
      if (StringBeginsWith(line, m_boundaries[count - 1])) {
        hitBoundary = true;
        // Boundaries collected after the matched one (nested/later ones)
        // are no longer needed.
        m_boundaries.SetLength(count);
        break;
      }
    }
  }

  if (hitBoundary) {
    if (m_base64part && m_partIsText) {
      // The part just finished was accumulated in buf; decode it now.
      Base64Decode(buf);
      if (buf.IsEmpty()) {
        NS_WARNING("Trying to transform an empty buffer");
        eatThisLine = true;
      } else {
        // Hand the decoded text to the caller.
        eatThisLine = false;
      }
    } else if (!m_partIsHtml) {
      // We have no content...
      buf.Truncate();
      eatThisLine = true;
    }

    if (m_partIsHtml) {
      StripHtml(buf);
    }

    // Reset all assumed headers for the next part. Once a multipart message
    // has been seen, every part must set 'm_partIsText' itself, so it no
    // longer defaults to true.
    m_pastPartHeaders = false;
    m_base64part = false;
    m_partIsHtml = false;
    m_partIsText = false;

    // Note: 'm_partIsQP' cannot be reset yet since it is still needed to
    // process the last buffer returned here. Parsing the next part will set
    // a new value.

    return buf.Length();
  }

  // --- Ordinary body line --------------------------------------------------
  if (!m_partIsText) {
    // Ignore non-text parts.
    buf.Truncate();
    eatThisLine = true;
    return 0;
  }

  if (!m_base64part && !m_partIsHtml) {
    // Plain text: pass the line through as is.
    buf.Assign(line);
    return buf.Length();
  }

  // Base64 and HTML parts are accumulated for later decoding or tag
  // stripping. Reaching this point without base64 means the part is HTML.
  if (!m_base64part) {
    const bool softLineBreak =
        m_partIsQP && !buf.IsEmpty() && StringEndsWith(buf, "="_ns);
    if (softLineBreak) {
      // Strip the quoted-printable soft line break.
      buf.SetLength(buf.Length() - 1);
    } else {
      // Replace the newline in HTML with a space.
      buf.Append(' ');
    }
  }

  buf.Append(line);
  eatThisLine = true;
  return buf.Length();
}

/**
 * Removes everything between '<' and '>' (inclusive) from the buffer.
 *
 * The text is compacted in place: the result is never longer than the input,
 * so no scratch allocation is needed and the string's constness is not cast
 * away. As before, processing stops at the first NUL character, and a tag
 * left open at the end of the buffer swallows the rest of the text.
 *
 * @param pBufInOut  (inout) the text to strip; receives the stripped text
 */
void nsMsgBodyHandler::StripHtml(nsCString& pBufInOut) {
  char* const begin = pBufInOut.BeginWriting();
  const char* const end = begin + pBufInOut.Length();

  char* out = begin;  // write cursor; never runs ahead of the read cursor
  bool inTag = false;
  for (const char* in = begin; in != end && *in; ++in) {
    if (inTag) {
      if (*in == '>') inTag = false;
    } else if (*in == '<') {
      inTag = true;
    } else {
      *out++ = *in;
    }
  }

  // Drop the leftover tail; Truncate() also re-terminates the string.
  pBufInOut.Truncate(static_cast<nsCString::size_type>(out - begin));
}

/**
 * Determines MIME information, if present, from the current header line.
 *
 * Depending on the line this updates m_partIsQP, m_base64part, m_partIsText,
 * m_partIsHtml, m_isMultipart, m_inMessageAttachment, m_pastPartHeaders,
 * m_partCharset and the list of collected boundaries.
 *
 * The `boundary=` and `charset=` values are parsed the same way: a value that
 * starts with a double quote runs to the next double quote, an unquoted value
 * runs to the next ';', and either runs to the end of the line if no
 * terminator is found. A line that ends directly after the '=' yields an
 * empty value instead of indexing past the end of the line.
 *
 * @param line  (in) a header line that may contain a MIME header
 */
void nsMsgBodyHandler::SniffPossibleMIMEHeader(const nsCString& line) {
  // Some parts of MIME are case-sensitive and other parts are
  // case-insensitive; specifically, the headers are all case-insensitive and
  // the values we care about are also case-insensitive, with the sole
  // exception of the boundary string. So matching is done on a lower-cased
  // copy while values are extracted from the original line.
  nsCString lowerCaseLine(line);
  ToLowerCase(lowerCaseLine);

  const int32_t lineLength = static_cast<int32_t>(line.Length());

  const bool isTransferEncoding =
      StringBeginsWith(lowerCaseLine, "content-transfer-encoding:"_ns);
  if (isTransferEncoding) {
    m_partIsQP = lowerCaseLine.Find("quoted-printable") != kNotFound;
  }

  if (StringBeginsWith(lowerCaseLine, "content-type:"_ns)) {
    if (lowerCaseLine.LowerCaseFindASCII("text/html") != kNotFound) {
      m_partIsText = true;
      m_partIsHtml = true;
    } else if (lowerCaseLine.Find("multipart/") != kNotFound) {
      if (m_isMultipart) {
        // Nested multipart, get ready for new headers.
        m_base64part = false;
        m_partIsQP = false;
        m_pastPartHeaders = false;
        m_partIsHtml = false;
        m_partIsText = false;
      }
      m_isMultipart = true;
      m_partCharset.Truncate();
    } else if (lowerCaseLine.Find("message/") != kNotFound) {
      // Attached message: initialise the part state again.
      m_base64part = false;
      m_partIsQP = false;
      m_pastPartHeaders = false;
      m_partIsHtml = false;
      // Default is text/plain, maybe proven otherwise later.
      m_partIsText = true;
      m_inMessageAttachment = true;
    } else {
      // Any other content type is text only if it says so; otherwise our
      // text/plain assumption is disproven.
      m_partIsText = lowerCaseLine.Find("text/") != kNotFound;
    }
  }

  int32_t start;
  if (m_isMultipart && (start = lowerCaseLine.Find("boundary=")) != kNotFound) {
    start += 9;  // strlen("boundary=")
    // Guard the index: the line may end right after the '='.
    const bool quoted = start < lineLength && line[start] == '\"';
    if (quoted) start++;
    // Search forward from the value for its terminator so that parameters
    // following the boundary do not end up inside it.
    int32_t end = line.FindChar(quoted ? '\"' : ';', start);
    if (end == kNotFound) end = lineLength;

    // Collect all boundaries. Since we only react to crossing a boundary,
    // we can simply collect the boundaries instead of forming a tree
    // structure from the message. Keep it simple ;-)
    nsCString boundary;
    boundary.AssignLiteral("--");
    boundary.Append(Substring(line, start, end - start));
    if (!m_boundaries.Contains(boundary)) m_boundaries.AppendElement(boundary);
  }

  if (m_isMultipart && (start = lowerCaseLine.Find("charset=")) != kNotFound) {
    start += 8;  // strlen("charset=")
    // Guard the index: the line may end right after the '='.
    const bool quoted = start < lineLength && line[start] == '\"';
    if (quoted) start++;
    int32_t end = line.FindChar(quoted ? '\"' : ';', start);
    if (end == kNotFound) end = lineLength;

    m_partCharset.Assign(Substring(line, start, end - start));
  }

  if (isTransferEncoding &&
      lowerCaseLine.LowerCaseFindASCII(ENCODING_BASE64) != kNotFound) {
    m_base64part = true;
  }
}

/**
 * Decodes the given base64 string in place.
 *
 * If PL_Base64Decode() yields no result the buffer is left unchanged.
 * Otherwise every CR and LF in the decoded text is turned into a space and
 * the buffer adopts the decoded text.
 *
 * @param pBufInOut  (inout) base64 text on entry, decoded text on return
 */
void nsMsgBodyHandler::Base64Decode(nsCString& pBufInOut) {
  char* decoded = PL_Base64Decode(pBufInOut.get(), pBufInOut.Length(), nullptr);
  if (!decoded) {
    return;
  }

  // Replace CR LF with spaces.
  for (char* cursor = decoded; *cursor; ++cursor) {
    if (*cursor == '\r' || *cursor == '\n') {
      *cursor = ' ';
    }
  }

  pBufInOut.Adopt(decoded);
}