rtfDecoder.cpp - mozsearch

comm-central/mailnews/import/src/rtfDecoder.cpp

Enable keyboard shortcuts

Source code

Go to header file

Revision control

Copy as Markdown

Other Tools

HG Web

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include <climits>
#include <cstddef>
#include <cstring>
#include <map>
#include <sstream>
#include <stack>
#include <string>

#include "windows.h"
#include "rtfDecoder.h"

// Number of elements in a statically sized array. Only meaningful for real
// arrays; applied to a pointer it silently yields a wrong value.
#define SIZEOF(x) (sizeof(x) / sizeof((x)[0]))

// True when `i` is an ASCII decimal digit. The argument is evaluated twice, so
// it must not have side effects.
#define IS_DIGIT(i) ((i) >= '0' && (i) <= '9')

// True when `VAL` is an ASCII letter (either case). The argument is evaluated
// up to four times, so it must not have side effects.
#define IS_ALPHA(VAL) \
  (((VAL) >= 'a' && (VAL) <= 'z') || ((VAL) >= 'A' && (VAL) <= 'Z'))

// Converts one hexadecimal digit character to its numeric value (0..15).
// Both upper- and lower-case letters are accepted. Any character that is not
// a hexadecimal digit yields 0, so callers cannot distinguish '0' from
// invalid input.
inline int HexToInt(char ch) {
  if (ch >= '0' && ch <= '9') return ch - '0';
  if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
  if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
  return 0;
}

// Maps a font charset number (the value stored from the \fcharset keyword) to
// the code page used when converting 8-bit text. Charsets that are not listed
// fall back to CP_ACP.
inline int CharsetToCP(int charset) {
  // We don't know the code page for the commented-out charsets.
  switch (charset) {
    case 0:
      return 1252;  // ANSI
    case 1:
      return 0;  // Default
    // case 2: return 42; // Symbol
    case 2:
      return 1252;  // Symbol
    case 77:
      return 10000;  // Mac Roman
    case 78:
      return 10001;  // Mac Shift Jis
    case 79:
      return 10003;  // Mac Hangul
    case 80:
      return 10008;  // Mac GB2312
    case 81:
      return 10002;  // Mac Big5
    // case 82: Mac Johab (old)
    case 83:
      return 10005;  // Mac Hebrew
    case 84:
      return 10004;  // Mac Arabic
    case 85:
      return 10006;  // Mac Greek
    case 86:
      return 10081;  // Mac Turkish
    case 87:
      return 10021;  // Mac Thai
    case 88:
      return 10029;  // Mac East Europe
    case 89:
      return 10007;  // Mac Russian
    case 128:
      return 932;  // Shift JIS
    case 129:
      return 949;  // Hangul
    case 130:
      return 1361;  // Johab
    case 134:
      return 936;  // GB2312
    case 136:
      return 950;  // Big5
    case 161:
      return 1253;  // Greek
    case 162:
      return 1254;  // Turkish
    case 163:
      return 1258;  // Vietnamese
    case 177:
      return 1255;  // Hebrew
    case 178:
      return 1256;  // Arabic
    // case 179: Arabic Traditional (old)
    // case 180: Arabic user (old)
    // case 181: Hebrew user (old)
    case 186:
      return 1257;  // Baltic
    case 204:
      return 1251;  // Russian
    case 222:
      return 874;  // Thai
    case 238:
      return 1250;  // Eastern European
    case 254:
      return 437;  // PC 437
    case 255:
      return 850;  // OEM
    default:
      return CP_ACP;
  }
}

// Encoding information collected for one font-table entry.
struct FontInfo {
  // Bit flags recording which of the fields below were explicitly set.
  enum Options { has_fcharset = 0x0001, has_cpg = 0x0002 };

  unsigned int options;  // Combination of Options flags
  int fcharset;          // Charset number; valid when has_fcharset is set
  unsigned int cpg;      // Code page; valid when has_cpg is set

  FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {}

  // Returns the code page for this font: an explicit code page wins over a
  // charset-derived one. Returns 0xFFFFFFFF when neither was specified.
  unsigned int Codepage() const {
    if (options & has_cpg) return cpg;
    if (options & has_fcharset) return CharsetToCP(fcharset);
    return 0xFFFFFFFF;
  }
};

// Font table: font index -> encoding information gathered for that font.
typedef std::map<int, FontInfo> Fonttbl;

// Parser state that is scoped to a group: a copy is pushed when a group
// begins and dropped when it ends (see StateStack below).
struct LocalState {
  bool fonttbl;  // When fonts are being defined
  int f;  // Index of the font being defined/used; defines the codepage if no
          // \cpg. -1 means no font is selected.
  unsigned int uc;        // ucN keyword value; its default is 1
  unsigned int codepage;  // defined by \cpg; 0xFFFFFFFF when \cpg is not in use
};

// Stack of group-scoped states; the top element is the state currently in
// effect.
typedef std::stack<LocalState> StateStack;

struct GlobalState {

  enum Pcdata_state { pcdsno, pcdsin, pcdsfinished };

  std::istream& stream;

  Fonttbl fonttbl;

  StateStack stack;

  unsigned int codepage;  // defined by \ansi, \mac, \pc, \pca, and \ansicpgN

  int deff;

  std::stringstream pcdata_a;

  unsigned int pcdata_a_codepage;

  Pcdata_state pcdata_a_state;

  explicit GlobalState(std::istream& s)

      : stream(s), codepage(CP_ACP), deff(-1), pcdata_a_state(pcdsno) {

    LocalState st;

    st.fonttbl = false;

    st.f = -1;

    st.uc = 1;

    st.codepage = 0xFFFFFFFF;

    stack.push(st);

  unsigned int GetCurrentCP() {

    if (stack.top().codepage != 0xFFFFFFFF)  // \cpg in use

      return stack.top().codepage;

    // \cpg not used; use font settings

    int f = (stack.top().f != -1) ? stack.top().f : deff;

    if (f != -1) {

      Fonttbl::iterator iter = fonttbl.find(f);

      if (iter != fonttbl.end()) {

        unsigned int cp = iter->second.Codepage();

        if (cp != 0xFFFFFFFF) return cp;

    return codepage;  // No overrides; use the top-level legacy setting

};

// A control word or control symbol as read by GetKeyword.
struct Keyword {
  char name[33];  // NUL-terminated name; at most 32 characters are kept
  bool hasVal;    // true when a numeric parameter followed the name
  int val;        // the numeric parameter; only meaningful when hasVal is true
};

class Lexem {

 public:

  enum Type {

    ltGroupBegin,

    ltGroupEnd,

    ltKeyword,

    ltPCDATA_A,

    ltPCDATA_W,

    ltBDATA,

    ltEOF,

    ltError

};

  explicit Lexem(Type t = ltError) : m_type(t) {}

  Lexem(Lexem& from) {

    switch (m_type = from.m_type) {

      case ltKeyword:

        m_keyword = from.m_keyword;

        break;

      case ltPCDATA_A:

        m_pcdata_a = from.m_pcdata_a;

        break;

      case ltPCDATA_W:

        m_pcdata_w = from.m_pcdata_w;

        break;

      case ltBDATA:

        m_bdata = from.m_bdata;  // Move pointers when copying.

        from.m_type = ltError;   // Invalidate the original. Not nice.

        break;

  ~Lexem() { Clear(); }

  Lexem& operator=(Lexem& from) {

    if (&from != this) {

      Clear();

      switch (m_type = from.m_type) {

        case ltKeyword:

          m_keyword = from.m_keyword;

          break;

        case ltPCDATA_A:

          m_pcdata_a = from.m_pcdata_a;

          break;

        case ltPCDATA_W:

          m_pcdata_w = from.m_pcdata_w;

          break;

        case ltBDATA:

          m_bdata = from.m_bdata;  // Move pointers when copying.

          from.m_type = ltError;   // Invalidate the original. Not nice.

          break;

    return *this;

  Type type() const { return m_type; }

  void SetPCDATA_A(char chdata) {

    Clear();

    m_pcdata_a = chdata;

    m_type = ltPCDATA_A;

  void SetPCDATA_W(wchar_t chdata) {

    Clear();

    m_pcdata_w = chdata;

    m_type = ltPCDATA_W;

  void SetBDATA(const char* data, int sz) {

    char* tmp = new char[sz];  // to allow getting the data from itself

    if (tmp) {

      memcpy(tmp, data, sz);

      Clear();

      m_bdata.data = tmp;

      m_bdata.sz = sz;

      m_type = ltBDATA;

    } else

      m_type = ltError;

  void SetKeyword(const Keyword& src) {

    Clear();

    m_type = ltKeyword;

    m_keyword = src;

  void SetKeyword(const char* name, bool hasVal = false, int val = 0) {

    char tmp[SIZEOF(m_keyword.name)];

    strncpy(tmp, name,

            SIZEOF(m_keyword.name) - 1);  // to allow copy drom itself

    tmp[SIZEOF(m_keyword.name) - 1] = 0;

    Clear();

    m_type = ltKeyword;

    memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name));

    m_keyword.hasVal = hasVal;

    m_keyword.val = val;

  const char* KeywordName() const {

    return (m_type == ltKeyword) ? m_keyword.name : 0;

  const int* KeywordVal() const {

    return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0;

  char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; }

  wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; }

  const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; }

  int bdata_sz() const { return (m_type == ltBDATA) ? m_bdata.sz : 0; }

  static Lexem eof;

  static Lexem groupBegin;

  static Lexem groupEnd;

  static Lexem error;

 private:

  struct BDATA {

    size_t sz;

    char* data;

};

  Type m_type;

  union {

    Keyword m_keyword;

    char m_pcdata_a;

    wchar_t m_pcdata_w;

    BDATA m_bdata;

};

  // This function leaves the object in the broken state. Must be followed

  // by a correct initialization.

  void Clear() {

    switch (m_type) {

      case ltBDATA:

        delete[] m_bdata.data;

        break;

    //  m_type = ltError;

};

Lexem Lexem::eof(ltEOF);

Lexem Lexem::groupBegin(ltGroupBegin);

Lexem Lexem::groupEnd(ltGroupEnd);

Lexem Lexem::error(ltError);

// Reads one control word or control symbol from `stream` and advances the
// stream past it. The stream must be positioned right after the introducing
// backslash.
//
// - If the first character is a letter, a control word is read: letters form
//   the name (only the first 32 are kept), optionally followed by a decimal
//   parameter with an optional leading '-'. A single space directly after the
//   control word is consumed as a delimiter; any other following character is
//   put back.
// - Otherwise the single character read is returned as a control symbol.
// - At end of input a keyword with an empty name is returned.
//
// Only the first 10 digits of a parameter are taken into account, and the
// magnitude is clamped to INT_MAX instead of overflowing.
Keyword GetKeyword(std::istream& stream) {
  Keyword keyword = {"", false, 0};
  char ch;
  if (stream.get(ch).eof()) return keyword;

  if (!IS_ALPHA(ch)) {  // Control symbol
    keyword.name[0] = ch;
    keyword.name[1] = 0;
    return keyword;
  }

  // Control word; maybe delimiter and value
  int len = 0;
  do {
    // We take up to 32 characters into account, skipping over extra
    // characters (allowing for some non-conformant implementation).
    if (len < 32) keyword.name[len++] = ch;
  } while (!stream.get(ch).eof() && IS_ALPHA(ch));
  keyword.name[len] = 0;  // NUL-terminating

  if (!stream.eof() && (IS_DIGIT(ch) || (ch == '-'))) {  // Value begin
    keyword.hasVal = true;
    const bool negative = (ch == '-');
    if (negative) stream.get(ch);

    int digits = 0;
    while (!stream.eof() && IS_DIGIT(ch)) {
      // We take into account only 10 digits, skip other. Older specs stated
      // that we must be ready for an arbitrary number of digits.
      if (digits < 10) {
        ++digits;
        const int digit = ch - '0';
        // Ten digits can exceed INT_MAX; saturate rather than overflow.
        if (keyword.val > (INT_MAX - digit) / 10)
          keyword.val = INT_MAX;
        else
          keyword.val = keyword.val * 10 + digit;
      }
      stream.get(ch);
    }
    if (negative) keyword.val = -keyword.val;
  }

  // End of control word; the space is just a delimiter - skip it
  if (!stream.eof() && ch != ' ') stream.unget();
  return keyword;
}

// Reads the next raw lexem from `stream` into `result`.
//
// Unescaped CR and LF characters are skipped. After that:
// - '{' and '}' produce group begin / group end;
// - a backslash introduces a keyword, read by GetKeyword (further
//   interpretation such as \u, \' or \bin is left to PreprocessLexem);
// - a tab character is reported as the keyword "tab";
// - any other character is reported as a single 8-bit PCDATA character.
// When nothing more can be read, `result` becomes Lexem::eof.
void GetLexem(std::istream& stream, Lexem& result) {
  char ch = 0;
  // Skip crlf. Stop on any read failure so that `ch` is never examined
  // unless a character was actually extracted.
  while (stream.get(ch) && ((ch == '\n') || (ch == '\r'))) {
  }

  if (!stream) {
    result = Lexem::eof;
    return;
  }

  switch (ch) {
    case '{':  // Group begin
      result = Lexem::groupBegin;
      break;
    case '}':  // Group end
      result = Lexem::groupEnd;
      break;
    case '\\':  // Keyword
      result.SetKeyword(GetKeyword(stream));
      break;
    case '\t':  // tab
      result.SetKeyword("tab");
      break;
    default:  // PCDATA
      result.SetPCDATA_A(ch);
      break;
  }
}

// Rewrites keyword lexems that really stand for data; every other lexem is
// left untouched.
//
//   empty name     -> error (presumably the input ended after a backslash)
//   \uN            -> wide character N, then `uc` following bytes are skipped
//   \'hh           -> the 8-bit character with hex code hh
//   \\  \{  \}     -> the escaped character itself, as 8-bit PCDATA
//   \binN          -> the next N bytes of the stream as binary data
//   \<CR>  \<LF>   -> the keyword "par"
//
// A \u or \bin without a value, a negative \bin length, or a \bin whose data
// is cut short by the end of the stream turns the lexem into Lexem::error.
void PreprocessLexem(/*inout*/ Lexem& lexem, std::istream& stream, int uc) {
  if (lexem.type() != Lexem::ltKeyword) return;

  // Points into `lexem`; must not be used after `lexem` has been modified.
  const char* name = lexem.KeywordName();

  if (name[0] == 0) {  // Empty keyword - maybe eof?
    lexem = Lexem::error;
  } else if (eq(name, "u")) {
    // Unicode character - get the UTF16 and skip the uc characters
    if (const int* val = lexem.KeywordVal()) {
      lexem.SetPCDATA_W(*val);
      stream.ignore(uc);
    } else {
      lexem = Lexem::error;
    }
  } else if (eq(name, "'")) {
    // 8-bit character (\'hh) -> use current codepage
    char ch = 0, ch1 = 0;
    if (!stream.get(ch).eof()) ch1 = HexToInt(ch);
    if (!stream.get(ch).eof()) (ch1 <<= 4) += HexToInt(ch);
    lexem.SetPCDATA_A(ch1);
  } else if (eq(name, "\\") || eq(name, "{") || eq(name, "}")) {
    // Escaped characters
    lexem.SetPCDATA_A(name[0]);
  } else if (eq(name, "bin")) {
    const int* count = lexem.KeywordVal();
    if (!count || *count < 0) {
      lexem = Lexem::error;
    } else {
      // The length comes from the document, so do not allocate it up front:
      // read in bounded chunks and let the buffer grow only as far as data
      // is really present in the stream.
      const int total = *count;
      std::string payload;
      char chunk[4096];
      int remaining = total;
      bool ok = !stream.fail();
      while (ok && remaining > 0) {
        const int want = (remaining < static_cast<int>(sizeof(chunk)))
                             ? remaining
                             : static_cast<int>(sizeof(chunk));
        stream.read(chunk, want);
        if (stream.gcount() != want) {
          ok = false;  // Stream ended before the announced length
        } else {
          payload.append(chunk, static_cast<size_t>(want));
          remaining -= want;
        }
      }
      if (ok)
        lexem.SetBDATA(payload.data(), total);
      else
        lexem = Lexem::error;
    }
  } else if (eq(name, "\n") || eq(name, "\r")) {
    // Escaped cr or lf
    lexem.SetKeyword("par");
  }
}

void UpdateState(const Lexem& lexem, /*inout*/ GlobalState& globalState) {

  switch (globalState.pcdata_a_state) {

    case GlobalState::pcdsfinished:  // Last time we finished the pcdata

      globalState.pcdata_a_state = GlobalState::pcdsno;

      break;

    case GlobalState::pcdsin:

      // to be reset later if still in the pcdata

      globalState.pcdata_a_state = GlobalState::pcdsfinished;

      break;

  switch (lexem.type()) {

    case Lexem::ltGroupBegin:

      globalState.stack.push(globalState.stack.top());

      break;

    case Lexem::ltGroupEnd:

      globalState.stack.pop();

      break;

    case Lexem::ltKeyword: {

      const int* val = lexem.KeywordVal();

      if (eq(lexem.KeywordName(), "ansi"))

        globalState.codepage = CP_ACP;

      else if (eq(lexem.KeywordName(), "mac"))

        globalState.codepage = CP_MACCP;

      else if (eq(lexem.KeywordName(), "pc"))

        globalState.codepage = 437;

      else if (eq(lexem.KeywordName(), "pca"))

        globalState.codepage = 850;

      else if (eq(lexem.KeywordName(), "ansicpg") && val)

        globalState.codepage = static_cast<unsigned int>(*val);

      else if (eq(lexem.KeywordName(), "deff") && val)

        globalState.deff = *val;

      else if (eq(lexem.KeywordName(), "fonttbl"))

        globalState.stack.top().fonttbl = true;

      else if (eq(lexem.KeywordName(), "f") && val) {

        globalState.stack.top().f = *val;

      } else if (eq(lexem.KeywordName(), "fcharset") &&

                 globalState.stack.top().fonttbl &&

                 (globalState.stack.top().f != -1) && val) {

        FontInfo& f = globalState.fonttbl[globalState.stack.top().f];

        f.options |= FontInfo::has_fcharset;

        f.fcharset = *val;

      } else if (eq(lexem.KeywordName(), "cpg") && val) {

        if (globalState.stack.top().fonttbl &&

            (globalState.stack.top().f != -1)) {  // Defining a font

          FontInfo& f = globalState.fonttbl[globalState.stack.top().f];

          f.options |= FontInfo::has_cpg;

          f.cpg = *val;

        } else {  // Overriding the codepage for the block - may be in filenames

          globalState.stack.top().codepage = *val;

      } else if (eq(lexem.KeywordName(), "plain"))

        globalState.stack.top().f = -1;

      else if (eq(lexem.KeywordName(), "uc") && val)

        globalState.stack.top().uc = *val;

    } break;

    case Lexem::ltPCDATA_A:

      if (globalState.pcdata_a_state ==

          GlobalState::pcdsno)  // Beginning of the pcdata

        globalState.pcdata_a_codepage =

            globalState.GetCurrentCP();  // to use later to convert to utf16

      globalState.pcdata_a_state = GlobalState::pcdsin;

      globalState.pcdata_a << lexem.pcdata_a();

      break;

void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder) {

  // Check if this is the rtf

  Lexem lexem;

  GetLexem(rtf, lexem);

  if (lexem.type() != Lexem::ltGroupBegin) return;

  decoder.BeginGroup();

  GetLexem(rtf, lexem);

  if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") ||

      !lexem.KeywordVal() || (*lexem.KeywordVal() != 1))

    return;

  decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());

  GlobalState state(rtf);

  // Level is the count of elements in the stack

  while (!state.stream.eof() &&

         (state.stack.size() > 0)) {  // Don't go past the global group

    GetLexem(state.stream, lexem);

    PreprocessLexem(lexem, state.stream, state.stack.top().uc);

    UpdateState(lexem, state);

    if (state.pcdata_a_state == GlobalState::pcdsfinished) {

      std::string s = state.pcdata_a.str();

      int sz = ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(),

                                     s.size(), 0, 0);

      if (sz) {

        wchar_t* data = new wchar_t[sz];

        ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(),

                              data, sz);

        decoder.PCDATA(data, sz);

        delete[] data;

      state.pcdata_a.str("");  // reset

    switch (lexem.type()) {

      case Lexem::ltGroupBegin:

        decoder.BeginGroup();

        break;

      case Lexem::ltGroupEnd:

        decoder.EndGroup();

        break;

      case Lexem::ltKeyword:

        decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());

        break;

      case Lexem::ltPCDATA_W: {

        wchar_t ch = lexem.pcdata_w();

        decoder.PCDATA(&ch, 1);

      } break;

      case Lexem::ltBDATA:

        decoder.BDATA(lexem.bdata(), lexem.bdata_sz());

        break;

      case Lexem::ltError:

        break;  // Just silently skip the erroneous data - basic error recovery

  }  // while

}  // DecodeRTF