Source code

Revision control

Other Tools

1
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2
// file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
// Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11
// "top-level directory" in the above notice refers to
12
// third_party/rust/encoding_c/.
13
14
#ifndef mozilla_Encoding_h
15
#define mozilla_Encoding_h
16
17
#include "mozilla/CheckedInt.h"
18
#include "mozilla/Maybe.h"
19
#include "mozilla/NotNull.h"
20
#include "mozilla/Span.h"
21
#include "mozilla/Tuple.h"
22
#include "nsString.h"
23
24
// Forward declarations of the three types this header is about, so that they
// can be named before their definitions appear.
namespace mozilla {
class Encoding;
class Decoder;
class Encoder;
}  // namespace mozilla
29
30
// Type-mapping macros. They are defined right before the "encoding_rs.h"
// include; presumably that header spells its FFI signatures in terms of these
// names so that the functions are typed with the mozilla:: classes
// (NOTE(review): confirm against encoding_rs.h).
#define ENCODING_RS_ENCODING mozilla::Encoding
// The continuation line must directly follow the backslash: anything placed
// between the two lines becomes the macro's replacement text instead.
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
  mozilla::NotNull<const mozilla::Encoding*>
#define ENCODING_RS_ENCODER mozilla::Encoder
#define ENCODING_RS_DECODER mozilla::Decoder
35
36
#include "encoding_rs.h"
37
38
extern "C" {
39
40
nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
41
uint8_t const* src, size_t src_len,
42
nsAString* dst);
43
44
nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
45
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
46
nsAString* dst);
47
48
nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
49
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
50
nsAString* dst);
51
52
nsresult
53
mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
54
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
55
nsAString* dst);
56
57
nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
58
char16_t const* src, size_t src_len,
59
nsACString* dst);
60
61
nsresult mozilla_encoding_decode_to_nscstring(
62
mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
63
64
nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
65
mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
66
67
nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
68
mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
69
70
nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
71
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
72
nsACString* dst, size_t already_validated);
73
74
nsresult
75
mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
76
mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
77
78
nsresult mozilla_encoding_encode_from_nscstring(
79
mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
80
81
} // extern "C"
82
83
namespace mozilla {
84
85
/**
86
* Return value from `Decoder`/`Encoder` to indicate that input
87
* was exhausted.
88
*/
89
const uint32_t kInputEmpty = INPUT_EMPTY;
90
91
/**
92
* Return value from `Decoder`/`Encoder` to indicate that output
93
* space was insufficient.
94
*/
95
const uint32_t kOutputFull = OUTPUT_FULL;
96
97
/**
98
* An encoding as defined in the Encoding Standard
* (https://encoding.spec.whatwg.org/).
100
*
101
* See https://docs.rs/encoding_rs/ for the Rust API docs.
102
*
103
* An _encoding_ defines a mapping from a byte sequence to a Unicode code point
104
* sequence and, in most cases, vice versa. Each encoding has a name, an output
105
* encoding, and one or more labels.
106
*
107
* _Labels_ are ASCII-case-insensitive strings that are used to identify an
108
* encoding in formats and protocols. The _name_ of the encoding is the
109
* preferred label in the case appropriate for returning from the
110
* `characterSet` property of the `Document` DOM interface, except for
111
* the replacement encoding whose name is not one of its labels.
112
*
113
* The _output encoding_ is the encoding used for form submission and URL
114
* parsing on Web pages in the encoding. This is UTF-8 for the replacement,
115
* UTF-16LE and UTF-16BE encodings and the encoding itself for other
116
* encodings.
117
*
118
* # Streaming vs. Non-Streaming
119
*
120
* When you have the entire input in a single buffer, you can use the
121
* methods `Decode()`, `DecodeWithBOMRemoval()`,
122
* `DecodeWithoutBOMHandling()`,
123
* `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
124
* `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
125
* `NewEncoder()` methods), these methods perform heap allocations. You should
* use the `Decoder` and `Encoder` objects when your input is split into
* multiple buffers or when you want to control the allocation of the output
* buffers.
128
*
129
* # Instances
130
*
131
* All instances of `Encoding` are statically allocated and have the process's
132
* lifetime. There is precisely one unique `Encoding` instance for each
133
* encoding defined in the Encoding Standard.
134
*
135
* To obtain a reference to a particular encoding whose identity you know at
136
* compile time, use a `static` that refers to encoding. There is a `static`
137
* for each encoding. The `static`s are named in all caps with hyphens
138
* replaced with underscores and with `_ENCODING` appended to the
139
* name. For example, if you know at compile time that you will want to
140
* decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
141
*
142
* If you don't know what encoding you need at compile time and need to
143
* dynamically get an encoding by label, use `Encoding::ForLabel()`.
144
*
145
* Pointers to `Encoding` can be compared with `==` to check for the sameness
146
* of two encodings.
147
*
148
* A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
149
* to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
150
* `const mozilla::Encoding*` in the C signature and
151
* `*const encoding_rs::Encoding` in the corresponding Rust signature.
152
*/
153
class Encoding final {
154
public:
155
/**
156
* Implements the _get an encoding_ algorithm
* (https://encoding.spec.whatwg.org/#concept-encoding-get).
158
*
159
* If, after ASCII-lowercasing and removing leading and trailing
160
* whitespace, the argument matches a label defined in the Encoding
161
* Standard, `const Encoding*` representing the corresponding
162
* encoding is returned. If there is no match, `nullptr` is returned.
163
*
164
* This is the right method to use if the action upon the method returning
165
* `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
166
* instead. When the action upon the method returning `nullptr` is not to
167
* proceed with a fallback but to refuse processing,
168
* `ForLabelNoReplacement()` is more appropriate.
169
*/
170
static inline const Encoding* ForLabel(Span<const char> aLabel) {
  // The FFI lookup wants the label as a (uint8_t pointer, length) pair.
  const uint8_t* labelBytes =
      reinterpret_cast<const uint8_t*>(aLabel.Elements());
  const size_t labelLength = aLabel.Length();
  return encoding_for_label(labelBytes, labelLength);
}
174
175
/**
176
* `nsAString` argument version. See above for docs.
177
*/
178
static inline const Encoding* ForLabel(const nsAString& aLabel) {
  // Re-encode the UTF-16 label as UTF-8 and hand it to the overload that
  // takes the label as chars. The conversion temporary stays alive until the
  // lookup has returned.
  return ForLabel(NS_ConvertUTF16toUTF8(aLabel));
}
181
182
/**
183
* This method behaves the same as `ForLabel()`, except when `ForLabel()`
184
* would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
185
*
186
* This method is useful in scenarios where a fatal error is required
187
* upon invalid label, because in those cases the caller typically wishes
188
* to treat the labels that map to the replacement encoding as fatal
189
* errors, too.
190
*
191
* It is not OK to use this method when the action upon the method returning
192
* `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
193
* such a case, the `ForLabel()` method should be used instead in order to
194
* avoid unsafe fallback for labels that `ForLabel()` maps to
195
* `REPLACEMENT_ENCODING`.
196
*/
197
static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
  // The FFI lookup wants the label as a (uint8_t pointer, length) pair.
  const uint8_t* labelBytes =
      reinterpret_cast<const uint8_t*>(aLabel.Elements());
  const size_t labelLength = aLabel.Length();
  return encoding_for_label_no_replacement(labelBytes, labelLength);
}
201
202
/**
203
* `nsAString` argument version. See above for docs.
204
*/
205
static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
  // Re-encode the UTF-16 label as UTF-8 and hand it to the overload that
  // takes the label as chars. The conversion temporary stays alive until the
  // lookup has returned.
  return ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
}
208
209
/**
210
* Performs non-incremental BOM sniffing.
211
*
212
* The argument must either be a buffer representing the entire input
213
* stream (non-streaming case) or a buffer representing at least the first
214
* three bytes of the input stream (streaming case).
215
*
216
* Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
217
* or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
218
* UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
219
*/
220
static inline Tuple<const Encoding*, size_t> ForBOM(
    Span<const uint8_t> aBuffer) {
  // In/out argument: goes in holding the buffer length and is read back
  // after the call as the second element of the returned tuple.
  size_t bomLength = aBuffer.Length();
  const Encoding* sniffed = encoding_for_bom(aBuffer.Elements(), &bomLength);
  return MakeTuple(sniffed, bomLength);
}
226
227
/**
228
* Writes the name of this encoding into `aName`.
229
*
230
* This name is appropriate to return as-is from the DOM
231
* `document.characterSet` property.
232
*/
233
inline void Name(nsACString& aName) const {
  // Size the string to ENCODING_NAME_MAX_LENGTH first so the FFI call has a
  // writable buffer to fill, then shrink it to what was actually written.
  aName.SetLength(ENCODING_NAME_MAX_LENGTH);
  uint8_t* nameBuffer = reinterpret_cast<uint8_t*>(aName.BeginWriting());
  const size_t written = encoding_name(this, nameBuffer);
  // Truncation in the 64-bit case is OK.
  aName.SetLength(written);
}
239
240
/**
241
* Checks whether the _output encoding_ of this encoding can encode every
242
* Unicode code point. (Only true if the output encoding is UTF-8.)
243
*/
244
inline bool CanEncodeEverything() const {
245
return encoding_can_encode_everything(this);
246
}
247
248
/**
249
* Checks whether this encoding maps one byte to one Basic Multilingual
250
* Plane code point (i.e. byte length equals decoded UTF-16 length) and
251
* vice versa (for mappable characters).
252
*
253
* `true` iff this encoding is on the list of Legacy single-byte
* encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
* in the spec or x-user-defined.
256
*/
257
inline bool IsSingleByte() const {
  // Plain forward to the FFI predicate for this encoding.
  const bool singleByte = encoding_is_single_byte(this);
  return singleByte;
}
258
259
/**
260
* Checks whether the bytes 0x00...0x7F map exclusively to the characters
261
* U+0000...U+007F and vice versa.
262
*/
263
inline bool IsAsciiCompatible() const {
264
return encoding_is_ascii_compatible(this);
265
}
266
267
/**
268
* Checks whether this is a Japanese legacy encoding.
269
*/
270
inline bool IsJapaneseLegacy() const {
271
return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
272
this == ISO_2022_JP_ENCODING;
273
}
274
275
/**
276
* Returns the _output encoding_ of this encoding. This is UTF-8 for
277
* UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
278
*/
279
inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
  // Ask the FFI layer for the output encoding, then wrap the result for the
  // NotNull return type.
  auto output = encoding_output_encoding(this);
  return WrapNotNull(output);
}
282
283
/**
284
* Decode complete input to `nsACString` _with BOM sniffing_ and with
285
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
286
* entire input is available as a single buffer (i.e. the end of the
287
* buffer marks the end of the stream).
288
*
289
* This method implements the (non-streaming version of) the
290
* _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
291
*
292
* The second item in the returned tuple is the encoding that was actually
293
* used (which may differ from this encoding thanks to BOM sniffing).
294
*
295
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
296
* if there were malformed sequences (that were replaced with the
297
* REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
298
* tuple.
299
*
300
* The backing buffer of the string isn't copied if the input buffer
301
* is heap-allocated and decoding from UTF-8 and the input is valid
302
* BOMless UTF-8, decoding from an ASCII-compatible encoding and
303
* the input is valid ASCII or decoding from ISO-2022-JP and the
304
* input stays in the ASCII state of ISO-2022-JP. It is OK to pass
305
* the same string as both arguments.
306
*
307
* _Note:_ It is wrong to use this when the input buffer represents only
308
* a segment of the input instead of the whole input. Use `NewDecoder()`
309
* when decoding segmented input.
310
*/
311
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
    const nsACString& aBytes, nsACString& aOut) const {
  // In/out for the FFI call: starts as this encoding and is read back
  // afterwards as the encoding that was actually used.
  const Encoding* usedEncoding = this;

  if (&aBytes == &aOut) {
    // Caller passed the same string as input and output: decode from a
    // private copy so the callee gets distinct source and destination.
    nsAutoCString inputCopy(aBytes);
    nsresult status =
        mozilla_encoding_decode_to_nscstring(&usedEncoding, &inputCopy, &aOut);
    return MakeTuple(status, WrapNotNull(usedEncoding));
  }

  nsresult status =
      mozilla_encoding_decode_to_nscstring(&usedEncoding, &aBytes, &aOut);
  return MakeTuple(status, WrapNotNull(usedEncoding));
}
325
326
/**
327
* Decode complete input to `nsAString` _with BOM sniffing_ and with
328
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
329
* entire input is available as a single buffer (i.e. the end of the
330
* buffer marks the end of the stream).
331
*
332
* This method implements the (non-streaming version of) the
333
* _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
334
*
335
* The second item in the returned tuple is the encoding that was actually
336
* used (which may differ from this encoding thanks to BOM sniffing).
337
*
338
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
339
* if there were malformed sequences (that were replaced with the
340
* REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
341
* tuple.
342
*
343
* _Note:_ It is wrong to use this when the input buffer represents only
344
* a segment of the input instead of the whole input. Use `NewDecoder()`
345
* when decoding segmented input.
346
*/
347
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
    Span<const uint8_t> aBytes, nsAString& aOut) const {
  // In/out for the FFI call: starts as this encoding and is read back
  // afterwards as the encoding that was actually used.
  const Encoding* usedEncoding = this;
  const uint8_t* source = aBytes.Elements();
  const size_t sourceLength = aBytes.Length();
  nsresult status = mozilla_encoding_decode_to_nsstring(
      &usedEncoding, source, sourceLength, &aOut);
  return MakeTuple(status, WrapNotNull(usedEncoding));
}
354
355
/**
356
* Decode complete input to `nsACString` _with BOM removal_ and with
357
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
358
* entire input is available as a single buffer (i.e. the end of the
359
* buffer marks the end of the stream).
360
*
361
* When invoked on `UTF_8`, this method implements the (non-streaming
362
* version of) the _UTF-8 decode_
* (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
364
*
365
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
366
* if there were malformed sequences (that were replaced with the
367
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
368
*
369
* The backing buffer of the string isn't copied if the input buffer
370
* is heap-allocated and decoding from UTF-8 and the input is valid
371
* BOMless UTF-8, decoding from an ASCII-compatible encoding and
372
* the input is valid ASCII or decoding from ISO-2022-JP and the
373
* input stays in the ASCII state of ISO-2022-JP. It is OK to pass
374
* the same string as both arguments.
375
*
376
* _Note:_ It is wrong to use this when the input buffer represents only
377
* a segment of the input instead of the whole input. Use
378
* `NewDecoderWithBOMRemoval()` when decoding segmented input.
379
*/
380
inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
                                     nsACString& aOut) const {
  nsresult status;
  if (&aBytes != &aOut) {
    status = mozilla_encoding_decode_to_nscstring_with_bom_removal(
        this, &aBytes, &aOut);
  } else {
    // Caller passed the same string as input and output: decode from a
    // private copy so the callee gets distinct source and destination.
    nsAutoCString inputCopy(aBytes);
    status = mozilla_encoding_decode_to_nscstring_with_bom_removal(
        this, &inputCopy, &aOut);
  }
  return status;
}
392
393
/**
394
* Decode complete input to `nsAString` _with BOM removal_ and with
395
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
396
* entire input is available as a single buffer (i.e. the end of the
397
* buffer marks the end of the stream).
398
*
399
* When invoked on `UTF_8`, this method implements the (non-streaming
400
* version of) the _UTF-8 decode_
* (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
402
*
403
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
404
* if there were malformed sequences (that were replaced with the
405
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
406
*
407
* _Note:_ It is wrong to use this when the input buffer represents only
408
* a segment of the input instead of the whole input. Use
409
* `NewDecoderWithBOMRemoval()` when decoding segmented input.
410
*/
411
inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
                                     nsAString& aOut) const {
  // Unpack the span into the (pointer, length) pair the FFI call takes.
  const uint8_t* source = aBytes.Elements();
  const size_t sourceLength = aBytes.Length();
  return mozilla_encoding_decode_to_nsstring_with_bom_removal(
      this, source, sourceLength, &aOut);
}
416
417
/**
418
* Decode complete input to `nsACString` _without BOM handling_ and
419
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
420
* the entire input is available as a single buffer (i.e. the end of the
421
* buffer marks the end of the stream).
422
*
423
* When invoked on `UTF_8`, this method implements the (non-streaming
424
* version of) the _UTF-8 decode without BOM_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
426
*
427
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
428
* if there were malformed sequences (that were replaced with the
429
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
430
*
431
* The backing buffer of the string isn't copied if the input buffer
432
* is heap-allocated and decoding from UTF-8 and the input is valid
433
* UTF-8, decoding from an ASCII-compatible encoding and the input
434
* is valid ASCII or decoding from ISO-2022-JP and the input stays
435
* in the ASCII state of ISO-2022-JP. It is OK to pass the same string
436
* as both arguments.
437
*
438
* _Note:_ It is wrong to use this when the input buffer represents only
439
* a segment of the input instead of the whole input. Use
440
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
441
*/
442
inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
                                         nsACString& aOut) const {
  nsresult status;
  if (&aBytes != &aOut) {
    status = mozilla_encoding_decode_to_nscstring_without_bom_handling(
        this, &aBytes, &aOut);
  } else {
    // Caller passed the same string as input and output: decode from a
    // private copy so the callee gets distinct source and destination.
    nsAutoCString inputCopy(aBytes);
    status = mozilla_encoding_decode_to_nscstring_without_bom_handling(
        this, &inputCopy, &aOut);
  }
  return status;
}
454
455
/**
456
* Decode complete input to `nsAString` _without BOM handling_ and
457
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
458
* the entire input is available as a single buffer (i.e. the end of the
459
* buffer marks the end of the stream).
460
*
461
* When invoked on `UTF_8`, this method implements the (non-streaming
462
* version of) the _UTF-8 decode without BOM_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
464
*
465
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
466
* if there were malformed sequences (that were replaced with the
467
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
468
*
469
* _Note:_ It is wrong to use this when the input buffer represents only
470
* a segment of the input instead of the whole input. Use
471
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
472
*/
473
inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
                                         nsAString& aOut) const {
  // Unpack the span into the (pointer, length) pair the FFI call takes.
  const uint8_t* source = aBytes.Elements();
  const size_t sourceLength = aBytes.Length();
  return mozilla_encoding_decode_to_nsstring_without_bom_handling(
      this, source, sourceLength, &aOut);
}
478
479
/**
480
* Decode complete input to `nsACString` _without BOM handling_ and
481
* _with malformed sequences treated as fatal_ when the entire input is
482
* available as a single buffer (i.e. the end of the buffer marks the end
483
* of the stream).
484
*
485
* When invoked on `UTF_8`, this method implements the (non-streaming
486
* version of) the _UTF-8 decode without BOM or fail_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
488
* spec concept.
489
*
490
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
491
* if a malformed sequence was encountered and `NS_OK` otherwise.
492
*
493
* The backing buffer of the string isn't copied if the input buffer
494
* is heap-allocated and decoding from UTF-8 and the input is valid
495
* UTF-8, decoding from an ASCII-compatible encoding and the input
496
* is valid ASCII or decoding from ISO-2022-JP and the input stays
497
* in the ASCII state of ISO-2022-JP. It is OK to pass the same string
498
* as both arguments.
499
*
500
* _Note:_ It is wrong to use this when the input buffer represents only
501
* a segment of the input instead of the whole input. Use
502
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
503
*/
504
inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
    const nsACString& aBytes, nsACString& aOut) const {
  nsresult status;
  if (&aBytes != &aOut) {
    status =
        mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
            this, &aBytes, &aOut);
  } else {
    // Caller passed the same string as input and output: decode from a
    // private copy so the callee gets distinct source and destination.
    nsAutoCString inputCopy(aBytes);
    status =
        mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
            this, &inputCopy, &aOut);
  }
  return status;
}
516
517
/**
518
* Decode complete input to `nsACString` _without BOM handling_ and
519
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
520
* the entire input is available as a single buffer (i.e. the end of the
521
* buffer marks the end of the stream) _asserting that a number of bytes
522
* from the start are already known to be valid UTF-8_.
523
*
524
* The use case for this method is avoiding copying when dealing with
525
* input that has a UTF-8 BOM. _When in doubt, do not use this method._
526
*
527
* When invoked on `UTF_8`, this method implements the (non-streaming
528
* version of) the _UTF-8 decode without BOM_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
530
*
531
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
532
* if there were malformed sequences (that were replaced with the
533
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
534
*
535
* _Note:_ It is wrong to use this when the input buffer represents only
536
* a segment of the input instead of the whole input. Use
537
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
538
*
539
* # Safety
540
*
541
* The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
542
* `aBytes` _must not_ alias the buffer (if any) of `aOut`.
543
*/
544
inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
                                         nsACString& aOut,
                                         size_t aAlreadyValidated) const {
  // Unpack the span into the (pointer, length) pair the FFI call takes;
  // aAlreadyValidated is passed through unchanged and is not checked here.
  const uint8_t* source = aBytes.Elements();
  const size_t sourceLength = aBytes.Length();
  return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
      this, source, sourceLength, &aOut, aAlreadyValidated);
}
550
551
/**
552
* Decode complete input to `nsAString` _without BOM handling_ and
553
* _with malformed sequences treated as fatal_ when the entire input is
554
* available as a single buffer (i.e. the end of the buffer marks the end
555
* of the stream).
556
*
557
* When invoked on `UTF_8`, this method implements the (non-streaming
558
* version of) the _UTF-8 decode without BOM or fail_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
560
* spec concept.
561
*
562
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
563
* if a malformed sequence was encountered and `NS_OK` otherwise.
564
*
565
* _Note:_ It is wrong to use this when the input buffer represents only
566
* a segment of the input instead of the whole input. Use
567
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
568
*/
569
inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
    Span<const uint8_t> aBytes, nsAString& aOut) const {
  // Unpack the span into the (pointer, length) pair the FFI call takes.
  const uint8_t* source = aBytes.Elements();
  const size_t sourceLength = aBytes.Length();
  return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
      this, source, sourceLength, &aOut);
}
574
575
/**
576
* Encode complete input to `nsACString` with unmappable characters
577
* replaced with decimal numeric character references when the entire input
578
* is available as a single buffer (i.e. the end of the buffer marks the
579
* end of the stream).
580
*
581
* This method implements the (non-streaming version of) the
582
* _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
583
*
584
* The second item in the returned tuple is the encoding that was actually
585
* used (which may differ from this encoding thanks to some encodings
586
* having UTF-8 as their output encoding).
587
*
588
* The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
589
* the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
590
* `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
591
* replaced with numeric character references) and `NS_OK` otherwise.
592
*
593
* The backing buffer of the string isn't copied if the input buffer
594
* is heap-allocated and encoding to UTF-8 and the input is valid
595
* UTF-8, encoding to an ASCII-compatible encoding and the input
596
* is valid ASCII or encoding to ISO-2022-JP and the input stays
597
* in the ASCII state of ISO-2022-JP. It is OK to pass the same string
598
* as both arguments.
599
*
600
* _Note:_ It is wrong to use this when the input buffer represents only
601
* a segment of the input instead of the whole input. Use `NewEncoder()`
602
* when encoding segmented output.
603
*/
604
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
    const nsACString& aString, nsACString& aOut) const {
  // In/out for the FFI call: starts as this encoding and is read back
  // afterwards as the encoding that was actually used.
  const Encoding* usedEncoding = this;

  if (&aString == &aOut) {
    // Caller passed the same string as input and output: encode from a
    // private copy so the callee gets distinct source and destination.
    nsAutoCString inputCopy(aString);
    nsresult status = mozilla_encoding_encode_from_nscstring(
        &usedEncoding, &inputCopy, &aOut);
    return MakeTuple(status, WrapNotNull(usedEncoding));
  }

  nsresult status =
      mozilla_encoding_encode_from_nscstring(&usedEncoding, &aString, &aOut);
  return MakeTuple(status, WrapNotNull(usedEncoding));
}
618
619
/**
620
* Encode complete input to `nsACString` with unmappable characters
621
* replaced with decimal numeric character references when the entire input
622
* is available as a single buffer (i.e. the end of the buffer marks the
623
* end of the stream).
624
*
625
* This method implements the (non-streaming version of) the
626
* _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
627
*
628
* The second item in the returned tuple is the encoding that was actually
629
* used (which may differ from this encoding thanks to some encodings
630
* having UTF-8 as their output encoding).
631
*
632
* The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
633
* OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
634
* were replaced with numeric character references) and `NS_OK` otherwise.
635
636
* _Note:_ It is wrong to use this when the input buffer represents only
637
* a segment of the input instead of the whole input. Use `NewEncoder()`
638
* when encoding segmented output.
639
*/
640
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
    Span<const char16_t> aString, nsACString& aOut) const {
  // In/out for the FFI call: starts as this encoding and is read back
  // afterwards as the encoding that was actually used.
  const Encoding* usedEncoding = this;
  const char16_t* source = aString.Elements();
  const size_t sourceLength = aString.Length();
  nsresult status = mozilla_encoding_encode_from_utf16(
      &usedEncoding, source, sourceLength, &aOut);
  return MakeTuple(status, WrapNotNull(usedEncoding));
}
647
648
/**
649
* Instantiates a new decoder for this encoding with BOM sniffing enabled.
650
*
651
* BOM sniffing may cause the returned decoder to morph into a decoder
652
* for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
653
*/
654
inline UniquePtr<Decoder> NewDecoder() const {
  // Take ownership of the freshly created decoder straight away.
  return UniquePtr<Decoder>(encoding_new_decoder(this));
}
658
659
/**
660
* Instantiates a new decoder for this encoding with BOM sniffing enabled
661
* into memory occupied by a previously-instantiated decoder.
662
*
663
* BOM sniffing may cause the returned decoder to morph into a decoder
664
* for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
665
*/
666
inline void NewDecoderInto(Decoder& aDecoder) const {
  // The FFI call takes the destination decoder by address.
  Decoder* target = &aDecoder;
  encoding_new_decoder_into(this, target);
}
669
670
/**
671
* Instantiates a new decoder for this encoding with BOM removal.
672
*
673
* If the input starts with bytes that are the BOM for this encoding,
674
* those bytes are removed. However, the decoder never morphs into a
675
* decoder for another encoding: A BOM for another encoding is treated as
676
* (potentially malformed) input to the decoding algorithm for this
677
* encoding.
678
*/
679
inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
  // Take ownership of the freshly created decoder straight away.
  return UniquePtr<Decoder>(encoding_new_decoder_with_bom_removal(this));
}
683
684
/**
685
* Instantiates a new decoder for this encoding with BOM removal
686
* into memory occupied by a previously-instantiated decoder.
687
*
688
* If the input starts with bytes that are the BOM for this encoding,
689
* those bytes are removed. However, the decoder never morphs into a
690
* decoder for another encoding: A BOM for another encoding is treated as
691
* (potentially malformed) input to the decoding algorithm for this
692
* encoding.
693
*/
694
inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
  // The FFI call takes the destination decoder by address.
  Decoder* target = &aDecoder;
  encoding_new_decoder_with_bom_removal_into(this, target);
}
697
698
/**
699
* Instantiates a new decoder for this encoding with BOM handling disabled.
700
*
701
* If the input starts with bytes that look like a BOM, those bytes are
702
* not treated as a BOM. (Hence, the decoder never morphs into a decoder
703
* for another encoding.)
704
*
705
* _Note:_ If the caller has performed BOM sniffing on its own but has not
706
* removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
707
* instead of this method to cause the BOM to be removed.
708
*/
709
inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
  // Take ownership of the freshly created decoder straight away.
  return UniquePtr<Decoder>(encoding_new_decoder_without_bom_handling(this));
}
713
714
/**
715
* Instantiates a new decoder for this encoding with BOM handling disabled
716
* into memory occupied by a previously-instantiated decoder.
717
*
718
* If the input starts with bytes that look like a BOM, those bytes are
719
* not treated as a BOM. (Hence, the decoder never morphs into a decoder
720
* for another encoding.)
721
*
722
* _Note:_ If the caller has performed BOM sniffing on its own but has not
723
* removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
724
* instead of this method to cause the BOM to be removed.
725
*/
726
inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
  // The FFI call takes the destination decoder by address.
  Decoder* target = &aDecoder;
  encoding_new_decoder_without_bom_handling_into(this, target);
}
729
730
/**
731
* Instantiates a new encoder for the output encoding of this encoding.
732
*/
733
inline UniquePtr<Encoder> NewEncoder() const {
  // Take ownership of the freshly created encoder straight away.
  return UniquePtr<Encoder>(encoding_new_encoder(this));
}
737
738
/**
739
* Instantiates a new encoder for the output encoding of this encoding
740
* into memory occupied by a previously-instantiated encoder.
741
*/
742
inline void NewEncoderInto(Encoder& aEncoder) const {
  // The FFI call takes the destination encoder by address.
  Encoder* target = &aEncoder;
  encoding_new_encoder_into(this, target);
}
745
746
/**
747
* Validates UTF-8.
748
*
749
* Returns the index of the first byte that makes the input malformed as
750
* UTF-8 or the length of the input if the input is entirely valid.
751
*/
752
static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
  // Unpack the span into the (pointer, length) pair the FFI call takes.
  const uint8_t* bytes = aBuffer.Elements();
  const size_t byteCount = aBuffer.Length();
  return encoding_utf8_valid_up_to(bytes, byteCount);
}
755
756
/**
757
* Validates ASCII.
758
*
759
* Returns the index of the first byte that makes the input malformed as
760
* ASCII or the length of the input if the input is entirely valid.
761
*/
762
static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
763
return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
764
}
765
766
/**
767
* Validates ISO-2022-JP ASCII-state data.
768
*
769
* Returns the index of the first byte that makes the input not
770
* representable in the ASCII state of ISO-2022-JP or the length of the
771
* input if the input is entirely representable in the ASCII state of
772
* ISO-2022-JP.
773
*/
774
static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
775
return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
776
aBuffer.Length());
777
}
778
779
private:
780
// All special members are deleted: C++ code can neither construct, copy,
// assign nor destroy an `Encoding`; it can only hold pointers or references
// to existing instances.
// NOTE(review): presumably the instances are statics owned by the underlying
// encoding_rs library -- confirm against encoding_rs.h.
Encoding() = delete;
781
Encoding(const Encoding&) = delete;
782
Encoding& operator=(const Encoding&) = delete;
783
~Encoding() = delete;
784
};
785
786
/**
787
* A converter that decodes a byte stream into Unicode according to a
788
* character encoding in a streaming (incremental) manner.
789
*
790
* The various `Decode*` methods take an input buffer (`aSrc`) and an output
791
* buffer `aDst` both of which are caller-allocated. There are variants for
792
* both UTF-8 and UTF-16 output buffers.
793
*
794
* A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
795
* into `aDst` until one of the following three things happens:
796
*
797
* 1. A malformed byte sequence is encountered (`*WithoutReplacement`
798
* variants only).
799
*
800
* 2. The output buffer has been filled so near capacity that the decoder
801
* cannot be sure that processing an additional byte of input wouldn't
802
* cause so much output that the output buffer would overflow.
803
*
804
* 3. All the input bytes have been processed.
805
*
806
* The `Decode*` method then returns a tuple of a status indicating which one
807
* of the three reasons to return happened, how many input bytes were read,
808
* how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
809
* when decoding to UTF-16) were written, and in the case of the
810
* variants performing replacement, a boolean indicating whether an error was
811
* replaced with the REPLACEMENT CHARACTER during the call.
812
*
813
* The number of bytes "written" is what's logically written. Garbage may be
814
* written in the output buffer beyond the point logically written to.
815
*
816
* In the case of the `*WithoutReplacement` variants, the status is a
817
* `uint32_t` whose possible values are packed info about a malformed byte
818
* sequence, `kOutputFull` and `kInputEmpty` corresponding to the three cases
819
* listed above.
820
*
821
* Packed info about malformed sequences has the following format:
822
* The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
823
* indicate the number of bytes that were consumed after the malformed
824
* sequence and whose next-lowest 8 bits, when shifted right by 8 indicate
825
* the length of the malformed byte sequence (possible decimal values 1, 2,
826
* 3 or 4). The maximum possible sum of the two is 6.
827
*
828
* In the case of methods whose name does not end with
829
* `*WithoutReplacement`, malformed sequences are automatically replaced
830
* with the REPLACEMENT CHARACTER and errors do not cause the methods to
831
* return early.
832
*
833
* When decoding to UTF-8, the output buffer must have at least 4 bytes of
834
* space. When decoding to UTF-16, the output buffer must have at least two
835
* UTF-16 code units (`char16_t`) of space.
836
*
837
* When decoding to UTF-8 without replacement, the methods are guaranteed
838
* not to return indicating that more output space is needed if the length
839
* of the output buffer is at least the length returned by
840
* `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
841
* with replacement, the length of the output buffer that guarantees the
842
* methods not to return indicating that more output space is needed is given
843
* by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
844
* or without replacement, the length of the output buffer that guarantees
845
* the methods not to return indicating that more output space is needed is
846
* given by `MaxUTF16BufferLength()`.
847
*
848
* The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
849
* and the output after each `Decode*` call is guaranteed to consist of
850
* complete characters. (I.e. the code unit sequence for the last character is
851
* guaranteed not to be split across output buffers.)
852
*
853
* The boolean argument `aLast` indicates that the end of the stream is reached
854
* when all the bytes in `aSrc` have been consumed.
855
*
856
* A `Decoder` object can be used to incrementally decode a byte stream.
857
*
858
* During the processing of a single stream, the caller must call `Decode*`
859
* zero or more times with `aLast` set to `false` and then call `Decode*` at
860
* least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
861
* the processing of the stream has ended. Otherwise, the caller must call
862
* `Decode*` again with `aLast` set to `true` (or treat a malformed result,
863
* i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
864
*
865
* Once the stream has ended, the `Decoder` object must not be used anymore.
866
* That is, you need to create another one to process another stream.
867
*
868
* When the decoder returns `kOutputFull` or the decoder returns a malformed
869
* result and the caller does not wish to treat it as a fatal error, the input
870
* buffer `aSrc` may not have been completely consumed. In that case, the caller
871
* must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
872
* call.
873
*
874
* # Infinite loops
875
*
876
* When converting with a fixed-size output buffer whose size is too small to
877
* accommodate one character of output, an infinite loop ensues. When
878
* converting with a fixed-size output buffer, it generally makes sense to
879
* make the buffer fairly large (e.g. couple of kilobytes).
880
*/
881
class Decoder final {
882
public:
883
~Decoder() {}
884
static void operator delete(void* aDecoder) {
885
decoder_free(reinterpret_cast<Decoder*>(aDecoder));
886
}
887
888
/**
889
* The `Encoding` this `Decoder` is for.
890
*
891
* BOM sniffing can change the return value of this method during the life
892
* of the decoder.
893
*/
894
inline NotNull<const mozilla::Encoding*> Encoding() const {
895
return WrapNotNull(decoder_encoding(this));
896
}
897
898
/**
899
* Query the worst-case UTF-8 output size _with replacement_.
900
*
901
* Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
902
* that will not overflow given the current state of the decoder and
903
* `aByteLength` number of additional input bytes when decoding with
904
* errors handled by outputting a REPLACEMENT CHARACTER for each malformed
905
* sequence.
906
*/
907
inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
908
CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
909
if (max.value() == MaxValue<size_t>::value) {
910
// Mark invalid by overflowing
911
max++;
912
MOZ_ASSERT(!max.isValid());
913
}
914
return max;
915
}
916
917
/**
918
* Query the worst-case UTF-8 output size _without replacement_.
919
*
920
* Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
921
* that will not overflow given the current state of the decoder and
922
* `aByteLength` number of additional input bytes when decoding without
923
* replacement error handling.
924
*
925
* Note that this value may be too small for the `WithReplacement` case.
926
* Use `MaxUTF8BufferLength()` for that case.
927
*/
928
inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
929
size_t aByteLength) const {
930
CheckedInt<size_t> max(
931
decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
932
if (max.value() == MaxValue<size_t>::value) {
933
// Mark invalid by overflowing
934
max++;
935
MOZ_ASSERT(!max.isValid());
936
}
937
return max;
938
}
939
940
/**
941
* Incrementally decode a byte stream into UTF-8 with malformed sequences
942
* replaced with the REPLACEMENT CHARACTER.
943
*
944
* See the documentation of the class for documentation for `Decode*`
945
* methods collectively.
946
*/
947
inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
948
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
949
size_t srcRead = aSrc.Length();
950
size_t dstWritten = aDst.Length();
951
bool hadReplacements;
952
uint32_t result =
953
decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
954
&dstWritten, aLast, &hadReplacements);
955
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
956
}
957
958
/**
959
* Incrementally decode a byte stream into UTF-8 _without replacement_.
960
*
961
* See the documentation of the class for documentation for `Decode*`
962
* methods collectively.
963
*/
964
inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
965
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
966
size_t srcRead = aSrc.Length();
967
size_t dstWritten = aDst.Length();
968
uint32_t result = decoder_decode_to_utf8_without_replacement(
969
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
970
return MakeTuple(result, srcRead, dstWritten);
971
}
972
973
/**
974
* Query the worst-case UTF-16 output size (with or without replacement).
975
*
976
* Returns the size of the output buffer in UTF-16 code units (`char16_t`)
977
* that will not overflow given the current state of the decoder and
978
* `aByteLength` number of additional input bytes.
979
*
980
* Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
981
* return value of this method applies also in the
982
* `_without_replacement` case.
983
*/
984
inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
985
CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
986
if (max.value() == MaxValue<size_t>::value) {
987
// Mark invalid by overflowing
988
max++;
989
MOZ_ASSERT(!max.isValid());
990
}
991
return max;
992
}
993
994
/**
995
* Incrementally decode a byte stream into UTF-16 with malformed sequences
996
* replaced with the REPLACEMENT CHARACTER.
997
*
998
* See the documentation of the class for documentation for `Decode*`
999
* methods collectively.
1000
*/
1001
inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
1002
Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1003
size_t srcRead = aSrc.Length();
1004
size_t dstWritten = aDst.Length();
1005
bool hadReplacements;
1006
uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
1007
aDst.Elements(), &dstWritten,
1008
aLast, &hadReplacements);
1009
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1010
}
1011
1012
/**
1013
* Incrementally decode a byte stream into UTF-16 _without replacement_.
1014
*
1015
* See the documentation of the class for documentation for `Decode*`
1016
* methods collectively.
1017
*/
1018
inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1019
Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1020
size_t srcRead = aSrc.Length();
1021
size_t dstWritten = aDst.Length();
1022
uint32_t result = decoder_decode_to_utf16_without_replacement(
1023
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1024
return MakeTuple(result, srcRead, dstWritten);
1025
}
1026
1027
/**
1028
* Checks for compatibility with storing Unicode scalar values as unsigned
1029
* bytes taking into account the state of the decoder.
1030
*
1031
* Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
1032
* including waiting for the BOM, or if the encoding is never
1033
* Latin1-byte-compatible.
1034
*
1035
* Otherwise returns the index of the first byte whose unsigned value doesn't
1036
* directly correspond to the decoded Unicode scalar value, or the length
1037
* of the input if all bytes in the input decode directly to scalar values
1038
* corresponding to the unsigned byte values.
1039
*
1040
* Does not change the state of the decoder.
1041
*
1042
* Do not use this unless you are supporting SpiderMonkey-style string
1043
* storage optimizations.
1044
*/
1045
inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo(
1046
Span<const uint8_t> aBuffer) const {
1047
size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(),
1048
aBuffer.Length());
1049
if (upTo == MaxValue<size_t>::value) {
1050
return mozilla::Nothing();
1051
}
1052
return mozilla::Some(upTo);
1053
}
1054
1055
private:
1056
Decoder() = delete;
1057
Decoder(const Decoder&) = delete;
1058
Decoder& operator=(const Decoder&) = delete;
1059
};
1060
1061
/**
1062
* A converter that encodes a Unicode stream into bytes according to a
1063
* character encoding in a streaming (incremental) manner.
1064
*
1065
* The various `Encode*` methods take an input buffer (`aSrc`) and an output
1066
* buffer `aDst` both of which are caller-allocated. There are variants for
1067
* both UTF-8 and UTF-16 input buffers.
1068
*
1069
* An `Encode*` method encodes characters from `aSrc` into bytes
1070
* stored into `aDst` until one of the following three things happens:
1071
*
1072
* 1. An unmappable character is encountered (`*WithoutReplacement` variants
1073
* only).
1074
*
1075
* 2. The output buffer has been filled so near capacity that the encoder
1076
* cannot be sure that processing an additional character of input wouldn't
1077
* cause so much output that the output buffer would overflow.
1078
*
1079
* 3. All the input characters have been processed.
1080
*
1081
* The `Encode*` method then returns a tuple of a status indicating which one
1082
* of the three reasons to return happened, how many input code units (`uint8_t`
1083
* when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1084
* how many output bytes were written, and in the case of the variants that
1085
* perform replacement, a boolean indicating whether an unmappable
1086
* character was replaced with a numeric character reference during the call.
1087
*
1088
* The number of bytes "written" is what's logically written. Garbage may be
1089
* written in the output buffer beyond the point logically written to.
1090
*
1091
* In the case of the methods whose name ends with
1092
* `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1093
* are an unmappable code point, `kOutputFull` and `kInputEmpty` corresponding
1094
* to the three cases listed above.
1095
*
1096
* In the case of methods whose name does not end with
1097
* `*WithoutReplacement`, unmappable characters are automatically replaced
1098
* with the corresponding numeric character references and unmappable
1099
* characters do not cause the methods to return early.
1100
*
1101
* When encoding from UTF-8 without replacement, the methods are guaranteed
1102
* not to return indicating that more output space is needed if the length
1103
* of the output buffer is at least the length returned by
1104
* `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1105
* UTF-8 with replacement, the length of the output buffer that guarantees the
1106
* methods not to return indicating that more output space is needed in the
1107
* absence of unmappable characters is given by
1108
* `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1109
* UTF-16 without replacement, the methods are guaranteed not to return
1110
* indicating that more output space is needed if the length of the output
1111
* buffer is at least the length returned by
1112
* `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1113
* from UTF-16 with replacement, the length of the output buffer that
1114
* guarantees the methods not to return indicating that more output space is
1115
* needed in the absence of unmappable characters is given by
1116
* `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1117
* When encoding with replacement, applications are not expected to size the
1118
* buffer for the worst case ahead of time but to resize the buffer if there
1119
* are unmappable characters. This is why max length queries are only available
1120
* for the case where there are no unmappable characters.
1121
*
1122
* When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1123
* encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1124
* REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1125
* turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1126
* surrogate pairs are not split across input buffer boundaries.
1127
*
1128
* After an `Encode*` call returns, the output produced so far, taken as a
1129
* whole from the start of the stream, is guaranteed to consist of a valid
1130
* byte sequence in the target encoding. (I.e. the code unit sequence for a
1131
* character is guaranteed not to be split across output buffers. However, due
1132
* to the stateful nature of ISO-2022-JP, the stream needs to be considered
1133
* from the start for it to be valid. For other encodings, the validity holds
1134
* on a per-output buffer basis.)
1135
*
1136
* The boolean argument `aLast` indicates that the end of the stream is reached
1137
* when all the characters in `aSrc` have been consumed. This argument is needed
1138
* for ISO-2022-JP and is ignored for other encodings.
1139
*
1140
* An `Encoder` object can be used to incrementally encode a Unicode stream.
1141
*
1142
* During the processing of a single stream, the caller must call `Encode*`
1143
* zero or more times with `aLast` set to `false` and then call `Encode*` at
1144
* least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1145
* the processing of the stream has ended. Otherwise, the caller must call
1146
* `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1147
* i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1148
*
1149
* Once the stream has ended, the `Encoder` object must not be used anymore.
1150
* That is, you need to create another one to process another stream.
1151
*
1152
* When the encoder returns `kOutputFull` or the encoder returns an unmappable
1153
* result and the caller does not wish to treat it as a fatal error, the input
1154
* buffer `aSrc` may not have been completely consumed. In that case, the caller
1155
* must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1156
* call.
1157
*
1158
* # Infinite loops
1159
*
1160
* When converting with a fixed-size output buffer whose size is too small to
1161
* accommodate one character of output, an infinite loop ensues. When
1162
* converting with a fixed-size output buffer, it generally makes sense to
1163
* make the buffer fairly large (e.g. couple of kilobytes).
1164
*/
1165
class Encoder final {
1166
public:
1167
~Encoder() {}
1168
1169
static void operator delete(void* aEncoder) {
1170
encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1171
}
1172
1173
/**
1174
* The `Encoding` this `Encoder` is for.
1175
*/
1176
inline NotNull<const mozilla::Encoding*> Encoding() const {
1177
return WrapNotNull(encoder_encoding(this));
1178
}
1179
1180
/**
1181
* Returns `true` if this is an ISO-2022-JP encoder that's not in the
1182
* ASCII state and `false` otherwise.
1183
*/
1184
inline bool HasPendingState() const {
1185
return encoder_has_pending_state(this);
1186
}
1187
1188
/**
1189
* Query the worst-case output size when encoding from UTF-8 with
1190
* replacement.
1191
*
1192
* Returns the size of the output buffer in bytes that will not overflow
1193
* given the current state of the encoder and `aByteLength` number of
1194
* additional input code units if there are no unmappable characters in
1195
* the input.
1196
*/
1197
inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1198
size_t aByteLength) const {
1199
CheckedInt<size_t> max(
1200
encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1201
aByteLength));
1202
if (max.value() == MaxValue<size_t>::value) {
1203
// Mark invalid by overflowing
1204
max++;
1205
MOZ_ASSERT(!max.isValid());
1206
}
1207
return max;
1208
}
1209
1210
/**
1211
* Query the worst-case output size when encoding from UTF-8 without
1212
* replacement.
1213
*
1214
* Returns the size of the output buffer in bytes that will not overflow
1215
* given the current state of the encoder and `aByteLength` number of
1216
* additional input code units.
1217
*/
1218
inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1219
size_t aByteLength) const {
1220
CheckedInt<size_t> max(
1221
encoder_max_buffer_length_from_utf8_without_replacement(this,
1222
aByteLength));
1223
if (max.value() == MaxValue<size_t>::value) {
1224
// Mark invalid by overflowing
1225
max++;
1226
MOZ_ASSERT(!max.isValid());
1227
}
1228
return max;
1229
}
1230
1231
/**
1232
* Incrementally encode into byte stream from UTF-8 with unmappable
1233
* characters replaced with HTML (decimal) numeric character references.
1234
*
1235
* See the documentation of the class for documentation for `Encode*`
1236
* methods collectively.
1237
*
1238
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1239
* The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1240
* absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1241
*/
1242
inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1243
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1244
size_t srcRead = aSrc.Length();
1245
size_t dstWritten = aDst.Length();
1246
bool hadReplacements;
1247
uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1248
aDst.Elements(), &dstWritten,
1249
aLast, &hadReplacements);
1250
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1251
}
1252
1253
/**
1254
* Incrementally encode into byte stream from UTF-8 _without replacement_.
1255
*
1256
* See the documentation of the class for documentation for `Encode*`
1257
* methods collectively.
1258
*
1259
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1260
* The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1261
* absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1262
*/
1263
inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1264
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1265
size_t srcRead = aSrc.Length();
1266
size_t dstWritten = aDst.Length();
1267
uint32_t result = encoder_encode_from_utf8_without_replacement(
1268
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1269
return MakeTuple(result, srcRead, dstWritten);
1270
}
1271
1272
/**
1273
* Query the worst-case output size when encoding from UTF-16 with
1274
* replacement.
1275
*
1276
* Returns the size of the output buffer in bytes that will not overflow
1277
* given the current state of the encoder and `aU16Length` number of
1278
* additional input code units if there are no unmappable characters in
1279
* the input.
1280
*/
1281
inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1282
size_t aU16Length) const {
1283
CheckedInt<size_t> max(
1284
encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1285
aU16Length));
1286
if (max.value() == MaxValue<size_t>::value) {
1287
// Mark invalid by overflowing
1288
max++;
1289
MOZ_ASSERT(!max.isValid());
1290
}
1291
return max;
1292
}
1293
1294
/**
1295
* Query the worst-case output size when encoding from UTF-16 without
1296
* replacement.
1297
*
1298
* Returns the size of the output buffer in bytes that will not overflow
1299
* given the current state of the encoder and `aU16Length` number of
1300
* additional input code units.
1301
*/
1302
inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1303
size_t aU16Length) const {
1304
CheckedInt<size_t> max(
1305
encoder_max_buffer_length_from_utf16_without_replacement(this,
1306
aU16Length));
1307
if (max.value() == MaxValue<size_t>::value) {
1308
// Mark invalid by overflowing
1309
max++;
1310
MOZ_ASSERT(!max.isValid());
1311
}
1312
return max;
1313
}
1314
1315
/**
1316
* Incrementally encode into byte stream from UTF-16 with unmappable
1317
* characters replaced with HTML (decimal) numeric character references.
1318
*
1319
* See the documentation of the class for documentation for `Encode*`
1320
* methods collectively.
1321
*/
1322
inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1323
Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1324
size_t srcRead = aSrc.Length();
1325
size_t dstWritten = aDst.Length();
1326
bool hadReplacements;
1327
uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1328
aDst.Elements(), &dstWritten,
1329
aLast, &hadReplacements);
1330
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1331
}
1332
1333
/**
1334
* Incrementally encode into byte stream from UTF-16 _without replacement_.
1335
*
1336
* See the documentation of the class for documentation for `Encode*`
1337
* methods collectively.
1338
*/
1339
inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1340
Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1341
size_t srcRead = aSrc.Length();
1342
size_t dstWritten = aDst.Length();
1343
uint32_t result = encoder_encode_from_utf16_without_replacement(
1344
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1345
return MakeTuple(result, srcRead, dstWritten);
1346
}
1347
1348
private:
1349
Encoder() = delete;
1350
Encoder(const Encoder&) = delete;
1351
Encoder& operator=(const Encoder&) = delete;
1352
};
1353
1354
}; // namespace mozilla
1355
1356
#endif // mozilla_Encoding_h