// getonescriptspan.h

// Copyright 2013 Google Inc. All Rights Reserved.

//

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

//

//     http://www.apache.org/licenses/LICENSE-2.0

//

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//

// Author: dsites@google.com (Dick Sites)

//

#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_

#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_

#include "integral_types.h"

#include "langspan.h"

#include "offsetmap.h"

namespace CLD2 {

// Size in bytes of the script buffer.
static const int kMaxScriptBuffer = 40960;

// Size in bytes of the lowercased-text buffer: one and a half times the
// script buffer.
// NOTE(review): presumably the extra room is for text that grows when
// lowercased -- confirm against the implementation.
static const int kMaxScriptLowerBuffer =
    kMaxScriptBuffer + (kMaxScriptBuffer / 2);

// Usable bytes of the script buffer; 32 bytes are held back to leave some
// room at the end.
static const int kMaxScriptBytes = kMaxScriptBuffer - 32;

// Stop at a word space within the last kWithinScriptTail bytes of the
// script buffer.
static const int kWithinScriptTail = 32;

// Returns true when c is a UTF-8 continuation (trailing) byte, i.e. one with
// the bit pattern 10xxxxxx (0x80..0xBF).
//
// Reinterpreted as a signed 8-bit value that range is -128..-65, so a single
// comparison against -64 is enough; the explicit cast makes the result
// independent of whether plain char is signed on the target platform.
static inline bool IsContinuationByte(char c) {
  return static_cast<signed char>(c) < -64;
}

// Gets lscript number for letters; always returns
//   0 (common script) for non-letters.
// NOTE(review): src presumably must point at the first byte of a complete
// UTF-8 character -- confirm against the definition, which is not in this
// header.
int GetUTF8LetterScriptNum(const char* src);

// Returns src advanced to point to the next quadgram, i.e. src +2..+5.
// Looks at src[0..4], so those five bytes must be readable by the caller's
// arrangement.
const char* AdvanceQuad(const char* src);

// Scans an input buffer (plain text or HTML) and returns successive spans of
// text through a caller-supplied LangSpan. Only the declaration lives in this
// header; the member functions are defined elsewhere.
//
// NOTE(review): the class holds raw char* buffers and declares a destructor.
// If the destructor frees those buffers, copying a ScriptScanner would be
// unsafe -- confirm against the implementation and consider disabling copy.
class ScriptScanner {
 public:
  // buffer / buffer_length describe the input to scan. is_plain_text is true
  // for text input and false for HTML.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);

  // Same as above with two additional flags.
  // NOTE(review): any_text / any_script presumably select the modes recorded
  // in letters_marks_only_ / one_script_only_ -- confirm in the definition.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
                bool any_text, bool any_script);

  ~ScriptScanner();

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  bool GetOneScriptSpan(LangSpan* span);

  // Force Latin and Cyrillic scripts to be lowercase
  void LowerScriptSpan(LangSpan* span);

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  // Force Latin and Cyrillic scripts to be lowercase
  bool GetOneScriptSpanLower(LangSpan* span);

  // Copy next run of non-tag characters to buffer [NUL terminated]
  // This just removes tags and removes entities
  // Buffer has leading space
  bool GetOneTextSpan(LangSpan* span);

  // Maps byte offset in most recent GetOneScriptSpan/Lower
  // span->text [0..text_bytes] into an additional byte offset from
  // span->offset, to get back to corresponding text in the original
  // input buffer.
  // text_offset must be the first byte
  // of a UTF-8 character, or just beyond the last character. Normally this
  // routine is called with the first byte of an interesting range and
  // again with the first byte of the following range.
  int MapBack(int text_offset);

  // Returns the starting byte of the buffer being scanned. Read-only
  // accessor, so it is const-qualified and usable on const scanners.
  const char* GetBufferStart() const { return start_byte_; }

 private:
  // Skip over tags and non-letters
  int SkipToFrontOfSpan(const char* src, int len, int* script);

  const char* start_byte_;        // Starting byte of buffer to scan
  const char* next_byte_;         // First unscanned byte
  const char* next_byte_limit_;   // Last byte + 1
  int byte_length_;               // Bytes left: next_byte_limit_ - next_byte_

  bool is_plain_text_;            // true for text, false for HTML
  char* script_buffer_;           // Holds text with expanded entities
  char* script_buffer_lower_;     // Holds lowercased text
  bool letters_marks_only_;       // To distinguish scriptspan of only
                                  // letters/marks vs. any mixture of text
  bool one_script_only_;          // To distinguish scriptspan of one
                                  // script vs. any mixture of scripts
  int exit_state_;                // For tag parser kTagParseTbl_0, based
                                  // on letters_marks_only_

 public:
  // Expose for debugging
  OffsetMap map2original_;    // map from script_buffer_ to buffer
  OffsetMap map2uplow_;       // map from script_buffer_lower_ to script_buffer_
};

}  // namespace CLD2

#endif  // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_