// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
// Updated 2014.01 for dual table lookup
//
#include "scoreonescriptspan.h"
#include "cldutil.h"
#include "debug.h"
#include "lang_script.h"
#include <stdio.h>
using namespace std;
namespace CLD2 {
static const int kUnreliablePercentThreshold = 75;
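// Unpack langprob and add its per-language scores to chunk_tote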
void AddLangProb(uint32 langprob, Tote* chunk_tote) {
ProcessProbV2Tote(langprob, chunk_tote);
}
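// Zero the score of the single per-script language packed in bits 8..15 of
// langprob (used to suppress languages "whacked" by an explicit hint)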
void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
uint8 top1 = (langprob >> 8) & 0xff;
chunk_tote->SetScore(top1, 0);
}
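// Languages are in the same close set if LanguageCloseSet() maps both to the
// same nonzero value; such hard-to-distinguish languages are treated as
// interchangeable by the scoring below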
bool SameCloseSet(uint16 lang1, uint16 lang2) {
int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
if (lang1_close_set == 0) {return false;}
int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
return (lang1_close_set == lang2_close_set);
}
bool SameCloseSet(Language lang1, Language lang2) {
int lang1_close_set = LanguageCloseSet(lang1);
if (lang1_close_set == 0) {return false;}
int lang2_close_set = LanguageCloseSet(lang2);
return (lang1_close_set == lang2_close_set);
}
// Needs expected score per 1KB in scoring context
void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
int offset, int len,
const ScoringContext* scoringcontext,
const Tote* chunk_tote,
ChunkSummary* chunksummary) {
int key3[3];
chunk_tote->CurrentTopThreeKeys(key3);
Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
int actual_score_per_kb = 0;
if (len > 0) {
actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
}
int expected_subscr = lang1 * 4 + LScript4(ulscript);
int expected_score_per_kb =
scoringcontext->scoringtables->kExpectedScore[expected_subscr];
chunksummary->offset = offset;
chunksummary->chunk_start = first_linear_in_chunk;
chunksummary->lang1 = lang1;
chunksummary->lang2 = lang2;
chunksummary->score1 = chunk_tote->GetScore(key3[0]);
chunksummary->score2 = chunk_tote->GetScore(key3[1]);
chunksummary->bytes = len;
chunksummary->grams = chunk_tote->GetScoreCount();
chunksummary->ulscript = ulscript;
chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
chunksummary->score2,
chunksummary->grams);
// If lang1/lang2 in same close set, set delta reliability to 100%
if (SameCloseSet(lang1, lang2)) {
chunksummary->reliability_delta = 100;
}
chunksummary->reliability_score =
ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
}
// Return true if just lang1 is there: lang2=0 and lang3=0
bool IsSingleLang(uint32 langprob) {
// Probably a bug -- which end is lang1? But only used to call empty Boost1
return ((langprob & 0x00ffff00) == 0);
}
// Update scoring context distinct_boost for single language quad
void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
// Probably keep this empty -- not a good enough signal
}
// Update scoring context distinct_boost for distinct octagram
// Keep last 4 used. Since these are mostly (except at splices) in
// hitbuffer, we might be able to just use a subscript and splice
void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
// this is called 0..n times per chunk with decoded hitbuffer->distinct...
LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
if (scoringcontext->ulscript != ULScript_Latin) {
distinct_boost = &scoringcontext->distinct_boost.othr;
}
int n = distinct_boost->n;
distinct_boost->langprob[n] = langprob;
distinct_boost->n = distinct_boost->wrap(n + 1);
}
// For each chunk, add extra weight for language priors (from content-lang and
// meta lang=xx) and distinctive tokens
void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
// Get boosts for current script
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
if (scoringcontext->ulscript != ULScript_Latin) {
langprior_boost = &scoringcontext->langprior_boost.othr;
langprior_whack = &scoringcontext->langprior_whack.othr;
distinct_boost = &scoringcontext->distinct_boost.othr;
}
for (int k = 0; k < kMaxBoosts; ++k) {
uint32 langprob = langprior_boost->langprob[k];
if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
}
for (int k = 0; k < kMaxBoosts; ++k) {
uint32 langprob = distinct_boost->langprob[k];
if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
}
// boost has a packed set of per-script langs and probabilities
// whack has a packed set of per-script lang to be suppressed (zeroed)
// When a language in a close set is given as an explicit hint, others in
// that set will be whacked here.
for (int k = 0; k < kMaxBoosts; ++k) {
uint32 langprob = langprior_whack->langprob[k];
if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
}
}
// At this point, the chunk is described by
// hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
// hitbuffer->delta[cspan->chunk_delta ... )
// hitbuffer->distinct[cspan->chunk_distinct ... )
// Scored text is in text[lo..hi) where
// lo is 0 or the min of first base/delta/distinct hitbuffer offset and
// hi is the min of next base/delta/distinct hitbuffer offset after
// base_len, etc.
void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
const ChunkSpan* cspan, int* lo, int* hi) {
// Front of this span
int lo_base = hitbuffer->base[cspan->chunk_base].offset;
int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
// Front of next span
int hi_base = hitbuffer->base[cspan->chunk_base +
cspan->base_len].offset;
int hi_delta = hitbuffer->delta[cspan->chunk_delta +
cspan->delta_len].offset;
int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
cspan->distinct_len].offset;
*lo = 0;
// if (cspan->chunk_base > 0) {
// *lo = minint(minint(lo_base, lo_delta), lo_distinct);
// }
*lo = minint(minint(lo_base, lo_delta), lo_distinct);
*hi = minint(minint(hi_base, hi_delta), hi_distinct);
}
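// Score difference (lang1 score minus lang2 score) for the table entry at
// indirect; entries below kCLDTableSizeOne hold one langprob, the rest two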
int DiffScore(const CLD2TableSummary* obj, int indirect,
uint16 lang1, uint16 lang2) {
if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
// Up to three languages at indirect
uint32 langprob = obj->kCLDTableInd[indirect];
return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
} else {
// Up to six languages at start + 2 * (indirect - start)
indirect += (indirect - obj->kCLDTableSizeOne);
uint32 langprob = obj->kCLDTableInd[indirect];
uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
(GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
}
}
// Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
// After last chunk there is always a hitbuffer entry with an offset just off
// the end of the text.
// Sets delta_len, and distinct_len
void ScoreOneChunk(const char* text, ULScript ulscript,
const ScoringHitBuffer* hitbuffer,
int chunk_i,
ScoringContext* scoringcontext,
ChunkSpan* cspan, Tote* chunk_tote,
ChunkSummary* chunksummary) {
int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];
chunk_tote->Reinit();
cspan->delta_len = 0;
cspan->distinct_len = 0;
if (scoringcontext->flags_cld2_verbose) {
fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
first_linear_in_chunk, first_linear_in_next_chunk);
}
// 2013.02.05 linear design: just use base and base_len for the span
cspan->chunk_base = first_linear_in_chunk;
cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
uint32 langprob = hitbuffer->linear[i].langprob;
AddLangProb(langprob, chunk_tote);
if (hitbuffer->linear[i].type <= QUADHIT) {
chunk_tote->AddScoreCount(); // Just count quads, not octas
}
if (hitbuffer->linear[i].type == DISTINCTHIT) {
AddDistinctBoost2(langprob, scoringcontext);
}
}
// Score language prior boosts
// Score distinct word boost
ScoreBoosts(scoringcontext, chunk_tote);
int lo = hitbuffer->linear[first_linear_in_chunk].offset;
int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;
// Chunk_tote: get top langs, scores, etc. and fill in chunk summary
SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
scoringcontext, chunk_tote, chunksummary);
bool more_to_come = false;
bool score_cjk = false;
if (scoringcontext->flags_cld2_html) {
// Show one chunk in readable output
CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
scoringcontext, cspan, chunksummary);
}
scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
}
// Score chunks of text described by hitbuffer, allowing each to be in a
// different language, and optionally adjusting the boundaries in between.
// Set last_cspan to the last chunkspan used
void ScoreAllHits(const char* text, ULScript ulscript,
bool more_to_come, bool score_cjk,
const ScoringHitBuffer* hitbuffer,
ScoringContext* scoringcontext,
SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
ChunkSpan cspan = {0, 0, 0, 0, 0, 0};
for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
// Score one chunk
// Sets delta_len, and distinct_len
Tote chunk_tote;
ChunkSummary chunksummary;
ScoreOneChunk(text, ulscript,
hitbuffer, i,
scoringcontext, &cspan, &chunk_tote, &chunksummary);
// Put result in summarybuffer
if (summarybuffer->n < kMaxSummaries) {
summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
summarybuffer->n += 1;
}
prior_cspan = cspan;
cspan.chunk_base += cspan.base_len;
cspan.chunk_delta += cspan.delta_len;
cspan.chunk_distinct += cspan.distinct_len;
}
// Add one dummy off the end to hold first unused linear_in_chunk
int linear_off_end = hitbuffer->next_linear;
int offset_off_end = hitbuffer->linear[linear_off_end].offset;
ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
memset(cs, 0, sizeof(ChunkSummary));
cs->offset = offset_off_end;
cs->chunk_start = linear_off_end;
*last_cspan = prior_cspan;
}
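// Fold each chunk's top language, byte count, score, and minimum reliability
// into the document-level tote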
void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
bool more_to_come, DocTote* doc_tote) {
int cs_bytes_sum = 0;
for (int i = 0; i < summarybuffer->n; ++i) {
const ChunkSummary* cs = &summarybuffer->chunksummary[i];
int reliability = minint(cs->reliability_delta, cs->reliability_score);
// doc_tote uses full languages
doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
cs_bytes_sum += cs->bytes;
}
}
// Turn on for debugging vectors
static const bool kShowLettersOriginal = false;
// If next chunk language matches last vector language, extend last element
// Otherwise add new element to vector
void ItemToVector(ScriptScanner* scanner,
ResultChunkVector* vec, Language new_lang,
int mapped_offset, int mapped_len) {
uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
int last_vec_subscr = vec->size() - 1;
if (last_vec_subscr >= 0) {
ResultChunk* priorrc = &(*vec)[last_vec_subscr];
last_vec_lang = priorrc->lang1;
if (new_lang == last_vec_lang) {
// Extend prior. Current mapped_offset may be beyond prior end, so do
// the arithmetic to include any such gap
priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
kMaxResultChunkBytes);
if (kShowLettersOriginal) {
// Optionally print the new chunk original text
string temp2(&scanner->GetBufferStart()[priorrc->offset],
priorrc->bytes);
fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
priorrc->offset, priorrc->offset + priorrc->bytes,
GetHtmlEscapedText(temp2).c_str());
}
return;
}
}
// Add new vector element
ResultChunk rc;
rc.offset = mapped_offset;
rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
rc.lang1 = static_cast<uint16>(new_lang);
vec->push_back(rc);
if (kShowLettersOriginal) {
// Optionally print the new chunk original text
string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
rc.offset, rc.offset + rc.bytes,
GetHtmlEscapedText(temp2).c_str());
}
}
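// Language of the last ResultChunk in vec, or UNKNOWN_LANGUAGE if vec is empty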
uint16 PriorVecLang(const ResultChunkVector* vec) {
if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
return (*vec)[vec->size() - 1].lang1;
}
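// Top language of the next chunk summary, or UNKNOWN_LANGUAGE at the end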
uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
if ((i + 1) >= summarybuffer->n) {
return static_cast<uint16>(UNKNOWN_LANGUAGE);
}
return summarybuffer->chunksummary[i + 1].lang1;
}
// Add n elements of summarybuffer to resultchunk vector:
// Each element is letters-only text [offset..offset+bytes)
// This maps back to original[Back(offset)..Back(offset+bytes))
//
// We go out of our way to minimize the variation in the ResultChunkVector,
// so that the caller has fewer but more meaningful spans in different
// languages, for the likely purpose of translation or spell-check.
//
// The language of each chunk is lang1, but it might be unreliable for
// either of two reasons: its score is relatively too close to the score of
// lang2, or its score is too far away from the expected score of real text in
// the given language. Unreliable languages are mapped to Unknown.
//
void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
const SummaryBuffer* summarybuffer,
bool more_to_come, ResultChunkVector* vec) {
if (vec == NULL) {return;}
if (kShowLettersOriginal) {
fprintf(stderr, "map2original_ ");
scanner->map2original_.DumpWindow();
fprintf(stderr, "<br>\n");
fprintf(stderr, "map2uplow_ ");
scanner->map2uplow_.DumpWindow();
fprintf(stderr, "<br>\n");
}
for (int i = 0; i < summarybuffer->n; ++i) {
const ChunkSummary* cs = &summarybuffer->chunksummary[i];
int unmapped_offset = cs->offset;
int unmapped_len = cs->bytes;
if (kShowLettersOriginal) {
// Optionally print the chunk lowercase letters/marks text
string temp(&text[unmapped_offset], unmapped_len);
fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
unmapped_offset, unmapped_offset + unmapped_len,
GetHtmlEscapedText(temp).c_str());
}
int mapped_offset = scanner->MapBack(unmapped_offset);
// Trim back a little to prefer splicing original at word boundaries
if (mapped_offset > 0) {
// Size of prior vector entry, if any
int prior_size = 0;
if (!vec->empty()) {
ResultChunk* rc = &(*vec)[vec->size() - 1];
prior_size = rc->bytes;
}
// Maximum back up size to leave at least 3 bytes in prior,
// and not entire buffer, and no more than 12 bytes total backup
int n_limit = minint(prior_size - 3, mapped_offset);
n_limit = minint(n_limit, 12);
// Backscan over letters, stopping if prior byte is < 0x41
// There is some possibility that we will backscan over a different script
const char* s = &scanner->GetBufferStart()[mapped_offset];
const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
int n = 0;
while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
if (n >= n_limit) {n = 0;} // New boundary not found within range
// Also back up exactly one leading punctuation character if '"#@
if (n < n_limit) {
unsigned char c = us[-n - 1];
if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
}
// Shrink the previous chunk slightly
if (n > 0) {
ResultChunk* rc = &(*vec)[vec->size() - 1];
rc->bytes -= n;
mapped_offset -= n;
if (kShowLettersOriginal) {
fprintf(stderr, "Back up %d bytes<br>\n", n);
// Optionally print the prior chunk original text
string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
fprintf(stderr, "Prior [%d..%d) '%s'<br>\n",
rc->offset, rc->offset + rc->bytes,
GetHtmlEscapedText(temp2).c_str());
}
}
}
int mapped_len =
scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
if (kShowLettersOriginal) {
// Optionally print the chunk original text
string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
mapped_offset, mapped_offset + mapped_len,
GetHtmlEscapedText(temp2).c_str());
}
Language new_lang = static_cast<Language>(cs->lang1);
bool reliability_delta_bad =
(cs->reliability_delta < kUnreliablePercentThreshold);
bool reliability_score_bad =
(cs->reliability_score < kUnreliablePercentThreshold);
// If the top language matches last vector, ignore reliability_delta
uint16 prior_lang = PriorVecLang(vec);
if (prior_lang == cs->lang1) {
reliability_delta_bad = false;
}
// If the top language is in same close set as last vector, set up to merge
if (SameCloseSet(cs->lang1, prior_lang)) {
new_lang = static_cast<Language>(prior_lang);
reliability_delta_bad = false;
}
// If the top two languages are in the same close set and the last vector
// language is the second language, set up to merge
if (SameCloseSet(cs->lang1, cs->lang2) &&
(prior_lang == cs->lang2)) {
new_lang = static_cast<Language>(prior_lang);
reliability_delta_bad = false;
}
// If unreliable and the last and next vector languages are both
// the second language, set up to merge
uint16 next_lang = NextChunkLang(summarybuffer, i);
if (reliability_delta_bad &&
(prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
new_lang = static_cast<Language>(prior_lang);
reliability_delta_bad = false;
}
if (reliability_delta_bad || reliability_score_bad) {
new_lang = UNKNOWN_LANGUAGE;
}
ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
}
}
// Add just one element to resultchunk vector:
// For RTypeNone or RTypeOne
void JustOneItemToVector(ScriptScanner* scanner, const char* text,
Language lang1, int unmapped_offset, int unmapped_len,
ResultChunkVector* vec) {
if (vec == NULL) {return;}
if (kShowLettersOriginal) {
fprintf(stderr, "map2original_ ");
scanner->map2original_.DumpWindow();
fprintf(stderr, "<br>\n");
fprintf(stderr, "map2uplow_ ");
scanner->map2uplow_.DumpWindow();
fprintf(stderr, "<br>\n");
}
if (kShowLettersOriginal) {
// Optionally print the chunk lowercase letters/marks text
string temp(&text[unmapped_offset], unmapped_len);
fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
unmapped_offset, unmapped_offset + unmapped_len,
GetHtmlEscapedText(temp).c_str());
}
int mapped_offset = scanner->MapBack(unmapped_offset);
int mapped_len =
scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
if (kShowLettersOriginal) {
// Optionally print the chunk original text
string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
mapped_offset, mapped_offset + mapped_len,
GetHtmlEscapedText(temp2).c_str());
}
ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
}
// Debugging. Not thread safe. Defined in getonescriptspan
char* DisplayPiece(const char* next_byte_, int byte_length_);
// If high bit is on, take out high bit and add 2B to make table2 entries easy
inline int PrintableIndirect(int x) {
if ((x & 0x80000000u) != 0) {
return (x & ~0x80000000u) + 2000000000;
}
return x;
}
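// Debug only: dump the base/delta/distinct hit arrays (truncated after ~50
// entries) as HTML to df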
void DumpHitBuffer(FILE* df, const char* text,
const ScoringHitBuffer* hitbuffer) {
fprintf(df,
"<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
ULScriptCode(hitbuffer->ulscript),
hitbuffer->next_base, hitbuffer->next_delta,
hitbuffer->next_distinct);
for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
if (i < hitbuffer->next_base) {
fprintf(df, "Q[%d]%d,%d,%s ",
i, hitbuffer->base[i].offset,
PrintableIndirect(hitbuffer->base[i].indirect),
DisplayPiece(&text[hitbuffer->base[i].offset], 6));
}
if (i < hitbuffer->next_delta) {
fprintf(df, "DL[%d]%d,%d,%s ",
i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
}
if (i < hitbuffer->next_distinct) {
fprintf(df, "D[%d]%d,%d,%s ",
i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
}
if (i < hitbuffer->next_base) {
fprintf(df, "<br>\n");
}
if (i > 50) {break;}
}
if (hitbuffer->next_base > 50) {
int i = hitbuffer->next_base;
fprintf(df, "Q[%d]%d,%d,%s ",
i, hitbuffer->base[i].offset,
PrintableIndirect(hitbuffer->base[i].indirect),
DisplayPiece(&text[hitbuffer->base[i].offset], 6));
}
if (hitbuffer->next_delta > 50) {
int i = hitbuffer->next_delta;
fprintf(df, "DL[%d]%d,%d,%s ",
i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
}
if (hitbuffer->next_distinct > 50) {
int i = hitbuffer->next_distinct;
fprintf(df, "D[%d]%d,%d,%s ",
i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
}
fprintf(df, "<br>\n");
}
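// Debug only: dump the merged linear hit array (including the dummy entry off
// the end) and the chunk_start subscripts as HTML to df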
void DumpLinearBuffer(FILE* df, const char* text,
const ScoringHitBuffer* hitbuffer) {
fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
hitbuffer->next_linear);
// Include the dummy entry off the end
for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
i, hitbuffer->linear[i].offset,
"UQLD"[hitbuffer->linear[i].type],
hitbuffer->linear[i].langprob,
DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
}
fprintf(df, "<br>\n");
fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
}
fprintf(df, "<br>\n");
}
// Move this verbose debugging output to debug.cc eventually
void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
// Print chunksummary
fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
cs->offset,
cs->chunk_start,
LanguageCode(static_cast<Language>(cs->lang1)),
cs->score1,
LanguageCode(static_cast<Language>(cs->lang2)),
cs->score2,
cs->bytes,
cs->grams,
ULScriptCode(static_cast<ULScript>(cs->ulscript)),
cs->reliability_delta,
cs->reliability_score);
}
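// Debug only: dump every ChunkSummary, including the dummy entry off the end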
void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
"bytesB ngrams# script rel_delta rel_score<br>\n");
for (int i = 0; i <= summarybuffer->n; ++i) {
fprintf(df, "[%d] ", i);
DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
}
fprintf(df, "<br>\n");
}
// Within hitbuffer->linear[]
// <-- prior chunk --><-- this chunk -->
// | | |
// linear0 linear1 linear2
// lang0 lang1
// The goal of sharpening is to move this_linear to better separate langs
int BetterBoundary(const char* text,
ScoringHitBuffer* hitbuffer,
ScoringContext* scoringcontext,
uint16 pslang0, uint16 pslang1,
int linear0, int linear1, int linear2) {
// Degenerate case, no change
if ((linear2 - linear0) <= 8) {return linear1;}
// Each diff gives pslang0 score - pslang1 score
// Running diff has four entries + + + + followed by four entries - - - -
// so that this value is maximal at the sharpest boundary between pslang0
// (positive diffs) and pslang1 (negative diffs)
int running_diff = 0;
int diff[8]; // Ring buffer of pslang0-pslang1 differences
// Initialize with first 8 diffs
for (int i = linear0; i < linear0 + 8; ++i) {
int j = i & 7;
uint32 langprob = hitbuffer->linear[i].langprob;
diff[j] = GetLangScore(langprob, pslang0) -
GetLangScore(langprob, pslang1);
if (i < linear0 + 4) {
// First four diffs pslang0 - pslang1
running_diff += diff[j];
} else {
// Second four diffs -(pslang0 - pslang1)
running_diff -= diff[j];
}
}
// Now scan for sharpest boundary. j is at left end of 8 entries
// To be a boundary, there must be both >0 and <0 entries in the window
int better_boundary_value = 0;
int better_boundary = linear1;
for (int i = linear0; i < linear2 - 8; ++i) {
int j = i & 7;
if (better_boundary_value < running_diff) {
bool has_plus = false;
bool has_minus = false;
for (int kk = 0; kk < 8; ++kk) {
if (diff[kk] > 0) {has_plus = true;}
if (diff[kk] < 0) {has_minus = true;}
}
if (has_plus && has_minus) {
better_boundary_value = running_diff;
better_boundary = i + 4;
}
}
// Shift right one entry
uint32 langprob = hitbuffer->linear[i + 8].langprob;
int newdiff = GetLangScore(langprob, pslang0) -
GetLangScore(langprob, pslang1);
int middiff = diff[(i + 4) & 7];
int olddiff = diff[j];
diff[j] = newdiff;
running_diff -= olddiff; // Remove left
running_diff += 2 * middiff; // Convert middle from - to +
running_diff -= newdiff; // Insert right
}
if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
linear1, better_boundary,
LanguageCode(lang0), LanguageCode(lang1));
int lin0_off = hitbuffer->linear[linear0].offset;
int lin1_off = hitbuffer->linear[linear1].offset;
int lin2_off = hitbuffer->linear[linear2].offset;
int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
int better_off = hitbuffer->linear[better_boundary].offset;
int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
string old0(&text[lin0_off], lin1_off - lin0_off);
string old1(&text[lin1_off], lin2_off - lin1_off);
string new0(&text[lin0_off], better_offm1 - lin0_off);
string new0m1(&text[better_offm1], better_off - better_offm1);
string new1(&text[better_off], better_offp1 - better_off);
string new1p1(&text[better_offp1], lin2_off - better_offp1);
fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
GetHtmlEscapedText(old0).c_str(),
GetHtmlEscapedText(old1).c_str(),
GetHtmlEscapedText(new0).c_str(),
GetHtmlEscapedText(new0m1).c_str(),
GetHtmlEscapedText(new1).c_str(),
GetHtmlEscapedText(new1p1).c_str());
// Slow picture of differences per linear entry
int d;
for (int i = linear0; i < linear2; ++i) {
if (i == better_boundary) {
fprintf(scoringcontext->debug_file, "^^ ");
}
uint32 langprob = hitbuffer->linear[i].langprob;
d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
const char* s = "=";
//if (d > 2) {s = "\xc2\xaf";} // Macron
if (d > 2) {s = "#";}
else if (d > 0) {s = "+";}
else if (d < -2) {s = "_";}
else if (d < 0) {s = "-";}
fprintf(scoringcontext->debug_file, "%s ", s);
}
fprintf(scoringcontext->debug_file, " (scale: #+=-_)<br>\n");
}
return better_boundary;
}
// For all but the first summary, if its top language differs from
// the previous chunk, refine the boundary
// Linearized version
void SharpenBoundaries(const char* text,
bool more_to_come,
ScoringHitBuffer* hitbuffer,
ScoringContext* scoringcontext,
SummaryBuffer* summarybuffer) {
int prior_linear = summarybuffer->chunksummary[0].chunk_start;
uint16 prior_lang = summarybuffer->chunksummary[0].lang1;
if (scoringcontext->flags_cld2_verbose) {
fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
}
for (int i = 1; i < summarybuffer->n; ++i) {
ChunkSummary* cs = &summarybuffer->chunksummary[i];
uint16 this_lang = cs->lang1;
if (this_lang == prior_lang) {
prior_linear = cs->chunk_start;
continue;
}
int this_linear = cs->chunk_start;
int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;
// If this/prior in same close set, don't move boundary
if (SameCloseSet(prior_lang, this_lang)) {
prior_linear = this_linear;
prior_lang = this_lang;
continue;
}
// Within hitbuffer->linear[]
// <-- prior chunk --><-- this chunk -->
// | | |
// prior_linear this_linear next_linear
// prior_lang this_lang
// The goal of sharpening is to move this_linear to better separate langs
uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
static_cast<Language>(prior_lang));
uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
static_cast<Language>(this_lang));
int better_linear = BetterBoundary(text,
hitbuffer,
scoringcontext,
pslang0, pslang1,
prior_linear, this_linear, next_linear);
int old_offset = hitbuffer->linear[this_linear].offset;
int new_offset = hitbuffer->linear[better_linear].offset;
cs->chunk_start = better_linear;
cs->offset = new_offset;
// If this_linear moved right, make bytes smaller for this, larger for prior
// If this_linear moved left, make bytes larger for this, smaller for prior
cs->bytes -= (new_offset - old_offset);
summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);
this_linear = better_linear; // Update so that next chunk doesn't intrude
// Consider rescoring the two chunks
// Update for next round (note: using pre-updated boundary)
prior_linear = this_linear;
prior_lang = this_lang;
}
}
// Make a langprob that gives small weight to the default language for ulscript
uint32 DefaultLangProb(ULScript ulscript) {
Language default_lang = DefaultLanguage(ulscript);
return MakeLangProb(default_lang, 1);
}
// Effectively, do a merge-sort based on text offsets
// Look up each indirect value in appropriate scoring table and keep
// just the resulting langprobs
void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
ScoringHitBuffer* hitbuffer) {
const CLD2TableSummary* base_obj; // unigram or quadgram
const CLD2TableSummary* base_obj2; // quadgram dual table
const CLD2TableSummary* delta_obj; // bigram or octagram
const CLD2TableSummary* distinct_obj; // bigram or octagram
uint16 base_hit;
if (score_cjk) {
base_obj = scoringcontext->scoringtables->unigram_compat_obj;
base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
delta_obj = scoringcontext->scoringtables->deltabi_obj;
distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
base_hit = UNIHIT;
} else {
base_obj = scoringcontext->scoringtables->quadgram_obj;
base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
delta_obj = scoringcontext->scoringtables->deltaocta_obj;
distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
base_hit = QUADHIT;
}
int base_limit = hitbuffer->next_base;
int delta_limit = hitbuffer->next_delta;
int distinct_limit = hitbuffer->next_distinct;
int base_i = 0;
int delta_i = 0;
int distinct_i = 0;
int linear_i = 0;
// Start with an initial base hit for the default language for this script
// Inserting this avoids edge effects with no hits at all
hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
hitbuffer->linear[linear_i].type = base_hit;
hitbuffer->linear[linear_i].langprob =
DefaultLangProb(scoringcontext->ulscript);
++linear_i;
while ((base_i < base_limit) || (delta_i < delta_limit) ||
(distinct_i < distinct_limit)) {
int base_off = hitbuffer->base[base_i].offset;
int delta_off = hitbuffer->delta[delta_i].offset;
int distinct_off = hitbuffer->distinct[distinct_i].offset;
// Do delta and distinct first, so that they are not lost at base_limit
if ((delta_i < delta_limit) &&
(delta_off <= base_off) && (delta_off <= distinct_off)) {
// Add delta entry
int indirect = hitbuffer->delta[delta_i].indirect;
++delta_i;
uint32 langprob = delta_obj->kCLDTableInd[indirect];
if (langprob > 0) {
hitbuffer->linear[linear_i].offset = delta_off;
hitbuffer->linear[linear_i].type = DELTAHIT;
hitbuffer->linear[linear_i].langprob = langprob;
++linear_i;
}
}
else if ((distinct_i < distinct_limit) &&
(distinct_off <= base_off) && (distinct_off <= delta_off)) {
// Add distinct entry
int indirect = hitbuffer->distinct[distinct_i].indirect;
++distinct_i;
uint32 langprob = distinct_obj->kCLDTableInd[indirect];
if (langprob > 0) {
hitbuffer->linear[linear_i].offset = distinct_off;
hitbuffer->linear[linear_i].type = DISTINCTHIT;
hitbuffer->linear[linear_i].langprob = langprob;
++linear_i;
}
}
else {
// Add one or two base entries
int indirect = hitbuffer->base[base_i].indirect;
// First, get right scoring table
const CLD2TableSummary* local_base_obj = base_obj;
if ((indirect & 0x80000000u) != 0) {
local_base_obj = base_obj2;
indirect &= ~0x80000000u;
}
++base_i;
// One langprob in kQuadInd[0..SingleSize),
// two in kQuadInd[SingleSize..Size)
if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
// Up to three languages at indirect
uint32 langprob = local_base_obj->kCLDTableInd[indirect];
if (langprob > 0) {
hitbuffer->linear[linear_i].offset = base_off;
hitbuffer->linear[linear_i].type = base_hit;
hitbuffer->linear[linear_i].langprob = langprob;
++linear_i;
}
} else {
// Up to six languages at start + 2 * (indirect - start)
indirect += (indirect - local_base_obj->kCLDTableSizeOne);
uint32 langprob = local_base_obj->kCLDTableInd[indirect];
uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
if (langprob > 0) {
hitbuffer->linear[linear_i].offset = base_off;
hitbuffer->linear[linear_i].type = base_hit;
hitbuffer->linear[linear_i].langprob = langprob;
++linear_i;
}
if (langprob2 > 0) {
hitbuffer->linear[linear_i].offset = base_off;
hitbuffer->linear[linear_i].type = base_hit;
hitbuffer->linear[linear_i].langprob = langprob2;
++linear_i;
}
}
}
}
// Update
hitbuffer->next_linear = linear_i;
// Add a dummy entry off the end, just to capture final offset
hitbuffer->linear[linear_i].offset =
hitbuffer->base[hitbuffer->next_base].offset;
hitbuffer->linear[linear_i].langprob = 0;
}
// Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
int chunksize;
uint16 base_hit;
if (score_cjk) {
chunksize = kChunksizeUnis;
base_hit = UNIHIT;
} else {
chunksize = kChunksizeQuads;
base_hit = QUADHIT;
}
int linear_i = 0;
int linear_off_end = hitbuffer->next_linear;
int text_i = letter_offset; // Next unseen text offset
int next_chunk_start = 0;
int bases_left = hitbuffer->next_base;
while (bases_left > 0) {
// Linearize one chunk
int base_len = chunksize; // Default; may be changed below
if (bases_left < (chunksize + (chunksize >> 1))) {
// If within 1.5 chunks of the end, avoid runts by using it all
base_len = bases_left;
} else if (bases_left < (2 * chunksize)) {
// Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
base_len = (bases_left + 1) >> 1;
}
hitbuffer->chunk_start[next_chunk_start] = linear_i;
hitbuffer->chunk_offset[next_chunk_start] = text_i;
++next_chunk_start;
int base_count = 0;
while ((base_count < base_len) && (linear_i < linear_off_end)) {
if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
++linear_i;
}
text_i = hitbuffer->linear[linear_i].offset; // Next unseen text offset
bases_left -= base_len;
}
// If no base hits at all, make a single dummy chunk
if (next_chunk_start == 0) {
hitbuffer->chunk_start[next_chunk_start] = 0;
hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
++next_chunk_start;
}
// Remember the linear array start of dummy entry
hitbuffer->next_chunk_start = next_chunk_start;
// Add a dummy entry off the end, just to capture final linear subscr
hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
hitbuffer->chunk_offset[next_chunk_start] = text_i;
}
// Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
// break linear array into chunks.
//
// Input:
// hitbuffer base, delta, distinct arrays
// Output:
// linear array
// chunk_start array
//
void LinearizeHitBuffer(int letter_offset,
ScoringContext* scoringcontext,
bool more_to_come, bool score_cjk,
ScoringHitBuffer* hitbuffer) {
LinearizeAll(scoringcontext, score_cjk, hitbuffer);
ChunkAll(letter_offset, score_cjk, hitbuffer);
}
// The hitbuffer is in an awkward form -- three sets of base/delta/distinct
// scores, each with an indirect subscript to one of six scoring tables, some
// of which can yield two langprobs for six languages, others one langprob for
// three languages. The only correlation between base/delta/distinct is their
// offsets into the letters-only text buffer.
//
// SummaryBuffer needs to be built to linear, giving linear offset of start of
// each chunk
//
// So we first do all the langprob lookups and merge-sort by offset to make
// a single linear vector, building a side vector of chunk beginnings as we go.
// The sharpening is simply moving the beginnings, scoring is a simple linear
// sweep, etc.
void ProcessHitBuffer(const LangSpan& scriptspan,
int letter_offset,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec,
bool more_to_come, bool score_cjk,
ScoringHitBuffer* hitbuffer) {
if (scoringcontext->flags_cld2_verbose) {
fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
}
LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
hitbuffer);
if (scoringcontext->flags_cld2_verbose) {
fprintf(scoringcontext->debug_file, "Linear[) ");
DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
}
SummaryBuffer summarybuffer;
summarybuffer.n = 0;
ChunkSpan last_cspan;
ScoreAllHits(scriptspan.text, scriptspan.ulscript,
more_to_come, score_cjk, hitbuffer,
scoringcontext,
&summarybuffer, &last_cspan);
if (scoringcontext->flags_cld2_verbose) {
DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
}
if (vec != NULL) {
// Sharpen boundaries of summarybuffer
// This is not a high-performance path
SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
&summarybuffer);
// Show after the sharpening
// CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
// hitbuffer, scoringcontext, &summarybuffer);
if (scoringcontext->flags_cld2_verbose) {
DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
}
}
SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
&summarybuffer, more_to_come, vec);
}
void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
// Splice hitbuffer and summarybuffer for next round. With big chunks and
// distinctive-word state carried across chunks, we might not need to do this.
hitbuffer->next_base = 0;
hitbuffer->next_delta = 0;
hitbuffer->next_distinct = 0;
hitbuffer->next_linear = 0;
hitbuffer->next_chunk_start = 0;
hitbuffer->lowest_offset = next_offset;
}
// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
// scoringcontext
void ScoreEntireScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec) {
int bytes = scriptspan.text_bytes;
// Artificially set score to 1024 per 1KB, or 1 per byte
int score = bytes;
int reliability = 100;
// doc_tote uses full languages
Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
doc_tote->Add(one_one_lang, bytes, score, reliability);
if (scoringcontext->flags_cld2_html) {
ChunkSummary chunksummary = {
1, 0,
one_one_lang, UNKNOWN_LANGUAGE, score, 1,
bytes, 0, scriptspan.ulscript, reliability, reliability
};
CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
false, false, NULL,
scoringcontext, NULL, &chunksummary);
}
// First byte is always a space
JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
one_one_lang, 1, bytes - 1, vec);
scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}
// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
void ScoreCJKScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec) {
// Allocate three parallel arrays of scoring hits
ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
hitbuffer->init();
hitbuffer->ulscript = scriptspan.ulscript;
scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
scoringcontext->oldest_distinct_boost = 0;
// Incoming scriptspan has a single leading space at scriptspan.text[0]
// and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
int letter_offset = 1; // Skip initial space
hitbuffer->lowest_offset = letter_offset;
int letter_limit = scriptspan.text_bytes;
while (letter_offset < letter_limit) {
if (scoringcontext->flags_cld2_verbose) {
fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
letter_offset, letter_limit);
}
//
// Fill up one hitbuffer, possibly splicing onto previous fragment
//
// NOTE: GetUniHits deals with close repeats
// NOTE: After last chunk there is always a hitbuffer entry with an offset
// just off the end of the text = next_offset.
int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
scoringcontext, hitbuffer);
// NOTE: GetBiHitVectors deals with close repeats,
// does one hash and two lookups (delta and distinct) per word
GetBiHits(scriptspan.text, letter_offset, next_offset,
scoringcontext, hitbuffer);
//
// Score one hitbuffer in chunks to summarybuffer
//
bool more_to_come = next_offset < letter_limit;
bool score_cjk = true;
ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
more_to_come, score_cjk, hitbuffer);
SpliceHitBuffer(hitbuffer, next_offset);
letter_offset = next_offset;
}
delete hitbuffer;
// Context across buffers is not connected yet
scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}
// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
// We have a scriptspan with all lowercase text in one script. Look up
// quadgrams and octagrams, saving the hits in three parallel vectors.
// Score from those vectors in chunks, toting each chunk to get a single
// language, and combining into the overall document score. The hit vectors
// in general are not big enough to handle an entire scriptspan, so
// repeat until the entire scriptspan is scored.
// Caller deals with minimizing number of runt scriptspans
// This routine deals with minimizing number of runt chunks.
//
// Returns updated scoringcontext
// Returns updated doc_tote
// If vec != NULL, appends to that vector of ResultChunk's
void ScoreQuadScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec) {
// Allocate three parallel arrays of scoring hits
ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
hitbuffer->init();
hitbuffer->ulscript = scriptspan.ulscript;
scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
scoringcontext->oldest_distinct_boost = 0;
// Incoming scriptspan has a single leading space at scriptspan.text[0]
// and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
int letter_offset = 1; // Skip initial space
hitbuffer->lowest_offset = letter_offset;
int letter_limit = scriptspan.text_bytes;
while (letter_offset < letter_limit) {
//
// Fill up one hitbuffer, possibly splicing onto previous fragment
//
// NOTE: GetQuadHits deals with close repeats
// NOTE: After last chunk there is always a hitbuffer entry with an offset
// just off the end of the text = next_offset.
int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
scoringcontext, hitbuffer);
// If true, there is more text to process in this scriptspan
// NOTE: GetOctaHitVectors deals with close repeats,
// does one hash and two lookups (delta and distinct) per word
GetOctaHits(scriptspan.text, letter_offset, next_offset,
scoringcontext, hitbuffer);
//
// Score one hitbuffer in chunks to summarybuffer
//
bool more_to_come = next_offset < letter_limit;
bool score_cjk = false;
ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
more_to_come, score_cjk, hitbuffer);
SpliceHitBuffer(hitbuffer, next_offset);
letter_offset = next_offset;
}
delete hitbuffer;
}
// Score one scriptspan into doc_tote and vec, updating scoringcontext
// Inputs:
// One scriptspan of perhaps 40-60KB, all same script lower-case letters
// and single ASCII spaces. First character is a space to allow simple
// beginning-of-word detect. End of buffer has three spaces and NUL to
// allow easy scan-to-end-of-word.
// Scoring context of
// scoring tables
// flags
// running boosts
// Outputs:
// Updated doc_tote giving overall languages and byte counts
// Optional updated chunk vector giving offset, length, language
//
// Caller initializes flags, boosts, doc_tote and vec.
// Caller aggregates across multiple scriptspans
// Caller calculates final document result
// Caller deals with detecting and triggering suppression of repeated text.
//
// This top-level routine just chooses the recognition type and calls one of
// the next-level-down routines.
//
void ScoreOneScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec) {
if (scoringcontext->flags_cld2_verbose) {
fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
// Optionally print the chunk lowercase letters/marks text
string temp(&scriptspan.text[0], scriptspan.text_bytes);
fprintf(scoringcontext->debug_file, "'%s'",
GetHtmlEscapedText(temp).c_str());
fprintf(scoringcontext->debug_file, "<br>\n");
}
scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
scoringcontext->oldest_distinct_boost = 0;
ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
rtype = RTypeMany;
}
switch (rtype) {
case RTypeNone:
case RTypeOne:
ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
break;
case RTypeCJK:
ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
break;
case RTypeMany:
ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
break;
}
}
} // End namespace CLD2