Source code

Revision control

Other Tools

1
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2
// file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
// Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11
// "top-level directory" in the above notice refers to
12
// third_party/rust/encoding_c/.
13
14
#ifndef mozilla_Encoding_h
15
#define mozilla_Encoding_h
16
17
#include "mozilla/CheckedInt.h"
18
#include "mozilla/NotNull.h"
19
#include "mozilla/Span.h"
20
#include "mozilla/Tuple.h"
21
#include "nsString.h"
22
23
// Forward declarations so that the ENCODING_RS_* macros defined below can
// name these types before their full definitions are seen.
namespace mozilla {
class Encoding;
class Decoder;
class Encoder;
}  // namespace mozilla
28
29
// C++ type names substituted for the ENCODING_RS_* placeholders; defined
// ahead of the encoding_rs.h include that follows (presumably consumed there).
#define ENCODING_RS_DECODER mozilla::Decoder
#define ENCODING_RS_ENCODER mozilla::Encoder
#define ENCODING_RS_ENCODING mozilla::Encoding
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
  mozilla::NotNull<const mozilla::Encoding*>
34
35
#include "encoding_rs.h"
36
37
extern "C" {
38
39
nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
40
uint8_t const* src, size_t src_len,
41
nsAString* dst);
42
43
nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
44
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
45
nsAString* dst);
46
47
nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
48
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
49
nsAString* dst);
50
51
nsresult
52
mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
53
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
54
nsAString* dst);
55
56
nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
57
char16_t const* src, size_t src_len,
58
nsACString* dst);
59
60
nsresult mozilla_encoding_decode_to_nscstring(
61
mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
62
63
nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
64
mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
65
66
nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
67
mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
68
69
nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
70
mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
71
nsACString* dst, size_t already_validated);
72
73
nsresult
74
mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
75
mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
76
77
nsresult mozilla_encoding_encode_from_nscstring(
78
mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
79
80
} // extern "C"
81
82
namespace mozilla {
83
84
/**
85
* Return value from `Decoder`/`Encoder` to indicate that input
86
* was exhausted.
87
*/
88
// Initialized from INPUT_EMPTY, which is not defined in this file
// (presumably provided by encoding_rs.h -- confirm).
const uint32_t kInputEmpty = INPUT_EMPTY;
89
90
/**
91
* Return value from `Decoder`/`Encoder` to indicate that output
92
* space was insufficient.
93
*/
94
// Initialized from OUTPUT_FULL, which is not defined in this file
// (presumably provided by encoding_rs.h -- confirm).
const uint32_t kOutputFull = OUTPUT_FULL;
95
96
/**
97
* An encoding as defined in the Encoding Standard
* (https://encoding.spec.whatwg.org/).
99
*
100
* See https://docs.rs/encoding_rs/ for the Rust API docs.
101
*
102
* An _encoding_ defines a mapping from a byte sequence to a Unicode code point
103
* sequence and, in most cases, vice versa. Each encoding has a name, an output
104
* encoding, and one or more labels.
105
*
106
* _Labels_ are ASCII-case-insensitive strings that are used to identify an
107
* encoding in formats and protocols. The _name_ of the encoding is the
108
* preferred label in the case appropriate for returning from the
109
* `characterSet` property of the `Document` DOM interface, except for
110
* the replacement encoding whose name is not one of its labels.
111
*
112
* The _output encoding_ is the encoding used for form submission and URL
113
* parsing on Web pages in the encoding. This is UTF-8 for the replacement,
114
* UTF-16LE and UTF-16BE encodings and the encoding itself for other
115
* encodings.
116
*
117
* # Streaming vs. Non-Streaming
118
*
119
* When you have the entire input in a single buffer, you can use the
120
* methods `Decode()`, `DecodeWithBOMRemoval()`,
121
* `DecodeWithoutBOMHandling()`,
122
* `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
123
* `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
* `NewEncoder()` methods), these methods perform heap allocations. You should
* use the `Decoder` and `Encoder` objects when your input is split into
* multiple buffers or when you want to control the allocation of the output
* buffers.
127
*
128
* # Instances
129
*
130
* All instances of `Encoding` are statically allocated and have the process's
131
* lifetime. There is precisely one unique `Encoding` instance for each
132
* encoding defined in the Encoding Standard.
133
*
134
* To obtain a reference to a particular encoding whose identity you know at
135
* compile time, use a `static` that refers to encoding. There is a `static`
136
* for each encoding. The `static`s are named in all caps with hyphens
137
* replaced with underscores and with `_ENCODING` appended to the
138
* name. For example, if you know at compile time that you will want to
139
* decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
140
*
141
* If you don't know what encoding you need at compile time and need to
142
* dynamically get an encoding by label, use `Encoding::ForLabel()`.
143
*
144
* Pointers to `Encoding` can be compared with `==` to check for the sameness
145
* of two encodings.
146
*
147
* A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
148
* to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
149
* `const mozilla::Encoding*` in the C signature and
150
* `*const encoding_rs::Encoding` in the corresponding Rust signature.
151
*/
152
class Encoding final {
153
public:
154
/**
155
* Implements the _get an encoding_ algorithm
* (https://encoding.spec.whatwg.org/#concept-encoding-get).
157
*
158
* If, after ASCII-lowercasing and removing leading and trailing
159
* whitespace, the argument matches a label defined in the Encoding
160
* Standard, `const Encoding*` representing the corresponding
161
* encoding is returned. If there is no match, `nullptr` is returned.
162
*
163
* This is the right method to use if the action upon the method returning
164
* `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
165
* instead. When the action upon the method returning `nullptr` is not to
166
* proceed with a fallback but to refuse processing,
167
* `ForLabelNoReplacement()` is more appropriate.
168
*/
169
static inline const Encoding* ForLabel(Span<const char> aLabel) {
  // encoding_for_label() takes the label as a byte pointer plus a length.
  const uint8_t* labelBytes =
      reinterpret_cast<const uint8_t*>(aLabel.Elements());
  const size_t labelLength = aLabel.Length();
  return encoding_for_label(labelBytes, labelLength);
}
173
174
/**
175
* `nsAString` argument version. See above for docs.
176
*/
177
static inline const Encoding* ForLabel(const nsAString& aLabel) {
  // Convert the UTF-16 label to UTF-8 and hand it to the overload that
  // accepts the converted string.
  return ForLabel(NS_ConvertUTF16toUTF8(aLabel));
}
180
181
/**
182
* This method behaves the same as `ForLabel()`, except when `ForLabel()`
183
* would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
184
*
185
* This method is useful in scenarios where a fatal error is required
186
* upon invalid label, because in those cases the caller typically wishes
187
* to treat the labels that map to the replacement encoding as fatal
188
* errors, too.
189
*
190
* It is not OK to use this method when the action upon the method returning
191
* `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
192
* such a case, the `ForLabel()` method should be used instead in order to
193
* avoid unsafe fallback for labels that `ForLabel()` maps to
194
* `REPLACEMENT_ENCODING`.
195
*/
196
static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
  // encoding_for_label_no_replacement() takes the label as a byte pointer
  // plus a length.
  const uint8_t* labelBytes =
      reinterpret_cast<const uint8_t*>(aLabel.Elements());
  const size_t labelLength = aLabel.Length();
  return encoding_for_label_no_replacement(labelBytes, labelLength);
}
200
201
/**
202
* `nsAString` argument version. See above for docs.
203
*/
204
static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
  // Convert the UTF-16 label to UTF-8 and hand it to the overload that
  // accepts the converted string.
  return ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
}
207
208
/**
209
* Performs non-incremental BOM sniffing.
210
*
211
* The argument must either be a buffer representing the entire input
212
* stream (non-streaming case) or a buffer representing at least the first
213
* three bytes of the input stream (streaming case).
214
*
215
* Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
216
* or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
217
* UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
218
*/
219
static inline Tuple<const Encoding*, size_t> ForBOM(
    Span<const uint8_t> aBuffer) {
  // In/out argument: carries the buffer length in and is passed by pointer
  // so that encoding_for_bom() can overwrite it.
  size_t bomLength = aBuffer.Length();
  const Encoding* sniffed = encoding_for_bom(aBuffer.Elements(), &bomLength);
  return MakeTuple(sniffed, bomLength);
}
225
226
/**
227
* Writes the name of this encoding into `aName`.
228
*
229
* This name is appropriate to return as-is from the DOM
230
* `document.characterSet` property.
231
*/
232
inline void Name(nsACString& aName) const {
  // Size the string to the maximum name length first so that there is room
  // for encoding_name() to write into.
  aName.SetLength(ENCODING_NAME_MAX_LENGTH);
  uint8_t* const nameBuffer =
      reinterpret_cast<uint8_t*>(aName.BeginWriting());
  const size_t written = encoding_name(this, nameBuffer);
  // Shrink to the length actually reported; truncation of the size_t in the
  // 64-bit case is OK.
  aName.SetLength(written);
}
238
239
/**
240
* Checks whether the _output encoding_ of this encoding can encode every
241
* Unicode code point. (Only true if the output encoding is UTF-8.)
242
*/
243
inline bool CanEncodeEverything() const {
244
return encoding_can_encode_everything(this);
245
}
246
247
/**
248
* Checks whether the bytes 0x00...0x7F map exclusively to the characters
249
* U+0000...U+007F and vice versa.
250
*/
251
inline bool IsAsciiCompatible() const {
252
return encoding_is_ascii_compatible(this);
253
}
254
255
/**
256
* Checks whether this is a Japanese legacy encoding.
257
*/
258
inline bool IsJapaneseLegacy() const {
259
return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
260
this == ISO_2022_JP_ENCODING;
261
}
262
263
/**
264
* Returns the _output encoding_ of this encoding. This is UTF-8 for
265
* UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
266
*/
267
inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
  // Look up the output encoding and wrap the result as NotNull.
  auto outputEncoding = encoding_output_encoding(this);
  return WrapNotNull(outputEncoding);
}
270
271
/**
272
* Decode complete input to `nsACString` _with BOM sniffing_ and with
273
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
274
* entire input is available as a single buffer (i.e. the end of the
275
* buffer marks the end of the stream).
276
*
277
* This method implements the (non-streaming version of) the
278
* _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
279
*
280
* The second item in the returned tuple is the encoding that was actually
281
* used (which may differ from this encoding thanks to BOM sniffing).
282
*
283
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
284
* if there were malformed sequences (that were replaced with the
285
* REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
286
* tuple.
287
*
288
* The backing buffer of the string isn't copied if the input buffer
289
* is heap-allocated and decoding from UTF-8 and the input is valid
290
* BOMless UTF-8, decoding from an ASCII-compatible encoding and
291
* the input is valid ASCII or decoding from ISO-2022-JP and the
292
* input stays in the ASCII state of ISO-2022-JP. It is OK to pass
293
* the same string as both arguments.
294
*
295
* _Note:_ It is wrong to use this when the input buffer represents only
296
* a segment of the input instead of the whole input. Use `NewDecoder()`
297
* when decoding segmented input.
298
*/
299
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
    const nsACString& aBytes, nsACString& aOut) const {
  // Starts out as this encoding; passed by pointer so that the callee can
  // overwrite it with the encoding that was actually used.
  const Encoding* usedEncoding = this;
  nsresult rv;
  if (&aBytes != &aOut) {
    rv = mozilla_encoding_decode_to_nscstring(&usedEncoding, &aBytes, &aOut);
  } else {
    // Input and output are the same string object: decode from a copy of the
    // input instead.
    nsAutoCString inputCopy(aBytes);
    rv = mozilla_encoding_decode_to_nscstring(&usedEncoding, &inputCopy,
                                              &aOut);
  }
  return MakeTuple(rv, WrapNotNull(usedEncoding));
}
313
314
/**
315
* Decode complete input to `nsAString` _with BOM sniffing_ and with
316
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
317
* entire input is available as a single buffer (i.e. the end of the
318
* buffer marks the end of the stream).
319
*
320
* This method implements the (non-streaming version of) the
321
* _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
322
*
323
* The second item in the returned tuple is the encoding that was actually
324
* used (which may differ from this encoding thanks to BOM sniffing).
325
*
326
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
327
* if there were malformed sequences (that were replaced with the
328
* REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
329
* tuple.
330
*
331
* _Note:_ It is wrong to use this when the input buffer represents only
332
* a segment of the input instead of the whole input. Use `NewDecoder()`
333
* when decoding segmented input.
334
*/
335
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
    Span<const uint8_t> aBytes, nsAString& aOut) const {
  // Starts out as this encoding; passed by pointer so that the callee can
  // overwrite it with the encoding that was actually used.
  const Encoding* usedEncoding = this;
  const uint8_t* input = aBytes.Elements();
  const size_t inputLength = aBytes.Length();
  nsresult rv = mozilla_encoding_decode_to_nsstring(&usedEncoding, input,
                                                    inputLength, &aOut);
  return MakeTuple(rv, WrapNotNull(usedEncoding));
}
342
343
/**
344
* Decode complete input to `nsACString` _with BOM removal_ and with
345
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
346
* entire input is available as a single buffer (i.e. the end of the
347
* buffer marks the end of the stream).
348
*
349
* When invoked on `UTF_8`, this method implements the (non-streaming
350
* version of) the _UTF-8 decode_
* (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
352
*
353
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
354
* if there were malformed sequences (that were replaced with the
355
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
356
*
357
* The backing buffer of the string isn't copied if the input buffer
358
* is heap-allocated and decoding from UTF-8 and the input is valid
359
* BOMless UTF-8, decoding from an ASCII-compatible encoding and
360
* the input is valid ASCII or decoding from ISO-2022-JP and the
361
* input stays in the ASCII state of ISO-2022-JP. It is OK to pass
362
* the same string as both arguments.
363
*
364
* _Note:_ It is wrong to use this when the input buffer represents only
365
* a segment of the input instead of the whole input. Use
366
* `NewDecoderWithBOMRemoval()` when decoding segmented input.
367
*/
368
inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
                                     nsACString& aOut) const {
  if (&aBytes != &aOut) {
    return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &aBytes,
                                                                 &aOut);
  }
  // Input and output are the same string object: decode from a copy of the
  // input instead.
  nsAutoCString inputCopy(aBytes);
  return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &inputCopy,
                                                               &aOut);
}
380
381
/**
382
* Decode complete input to `nsAString` _with BOM removal_ and with
383
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
384
* entire input is available as a single buffer (i.e. the end of the
385
* buffer marks the end of the stream).
386
*
387
* When invoked on `UTF_8`, this method implements the (non-streaming
388
* version of) the _UTF-8 decode_
* (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
390
*
391
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
392
* if there were malformed sequences (that were replaced with the
393
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
394
*
395
* _Note:_ It is wrong to use this when the input buffer represents only
396
* a segment of the input instead of the whole input. Use
397
* `NewDecoderWithBOMRemoval()` when decoding segmented input.
398
*/
399
inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
                                     nsAString& aOut) const {
  // Unpack the span into the pointer/length pair taken by the callee.
  const uint8_t* input = aBytes.Elements();
  const size_t inputLength = aBytes.Length();
  return mozilla_encoding_decode_to_nsstring_with_bom_removal(
      this, input, inputLength, &aOut);
}
404
405
/**
406
* Decode complete input to `nsACString` _without BOM handling_ and
407
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
408
* the entire input is available as a single buffer (i.e. the end of the
409
* buffer marks the end of the stream).
410
*
411
* When invoked on `UTF_8`, this method implements the (non-streaming
412
* version of) the _UTF-8 decode without BOM_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
414
*
415
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
416
* if there were malformed sequences (that were replaced with the
417
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
418
*
419
* The backing buffer of the string isn't copied if the input buffer
420
* is heap-allocated and decoding from UTF-8 and the input is valid
421
* UTF-8, decoding from an ASCII-compatible encoding and the input
422
* is valid ASCII or decoding from ISO-2022-JP and the input stays
423
* in the ASCII state of ISO-2022-JP. It is OK to pass the same string
424
* as both arguments.
425
*
426
* _Note:_ It is wrong to use this when the input buffer represents only
427
* a segment of the input instead of the whole input. Use
428
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
429
*/
430
inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
                                         nsACString& aOut) const {
  if (&aBytes != &aOut) {
    return mozilla_encoding_decode_to_nscstring_without_bom_handling(
        this, &aBytes, &aOut);
  }
  // Input and output are the same string object: decode from a copy of the
  // input instead.
  nsAutoCString inputCopy(aBytes);
  return mozilla_encoding_decode_to_nscstring_without_bom_handling(
      this, &inputCopy, &aOut);
}
442
443
/**
444
* Decode complete input to `nsAString` _without BOM handling_ and
445
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
446
* the entire input is available as a single buffer (i.e. the end of the
447
* buffer marks the end of the stream).
448
*
449
* When invoked on `UTF_8`, this method implements the (non-streaming
450
* version of) the _UTF-8 decode without BOM_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
452
*
453
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
454
* if there were malformed sequences (that were replaced with the
455
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
456
*
457
* _Note:_ It is wrong to use this when the input buffer represents only
458
* a segment of the input instead of the whole input. Use
459
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
460
*/
461
inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
                                         nsAString& aOut) const {
  // Unpack the span into the pointer/length pair taken by the callee.
  const uint8_t* input = aBytes.Elements();
  const size_t inputLength = aBytes.Length();
  return mozilla_encoding_decode_to_nsstring_without_bom_handling(
      this, input, inputLength, &aOut);
}
466
467
/**
468
* Decode complete input to `nsACString` _without BOM handling_ and
469
* _with malformed sequences treated as fatal_ when the entire input is
470
* available as a single buffer (i.e. the end of the buffer marks the end
471
* of the stream).
472
*
473
* When invoked on `UTF_8`, this method implements the (non-streaming
474
* version of) the _UTF-8 decode without BOM or fail_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
476
* spec concept.
477
*
478
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
479
* if a malformed sequence was encountered and `NS_OK` otherwise.
480
*
481
* The backing buffer of the string isn't copied if the input buffer
482
* is heap-allocated and decoding from UTF-8 and the input is valid
483
* UTF-8, decoding from an ASCII-compatible encoding and the input
484
* is valid ASCII or decoding from ISO-2022-JP and the input stays
485
* in the ASCII state of ISO-2022-JP. It is OK to pass the same string
486
* as both arguments.
487
*
488
* _Note:_ It is wrong to use this when the input buffer represents only
489
* a segment of the input instead of the whole input. Use
490
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
491
*/
492
inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
    const nsACString& aBytes, nsACString& aOut) const {
  if (&aBytes != &aOut) {
    return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
        this, &aBytes, &aOut);
  }
  // Input and output are the same string object: decode from a copy of the
  // input instead.
  nsAutoCString inputCopy(aBytes);
  return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
      this, &inputCopy, &aOut);
}
504
505
/**
506
* Decode complete input to `nsACString` _without BOM handling_ and
507
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
508
* the entire input is available as a single buffer (i.e. the end of the
509
* buffer marks the end of the stream) _asserting that a number of bytes
510
* from the start are already known to be valid UTF-8_.
511
*
512
* The use case for this method is avoiding copying when dealing with
513
* input that has a UTF-8 BOM. _When in doubt, do not use this method._
514
*
515
* When invoked on `UTF_8`, this method implements the (non-streaming
516
* version of) the _UTF-8 decode without BOM_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
518
*
519
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
520
* if there were malformed sequences (that were replaced with the
521
* REPLACEMENT CHARACTER) and `NS_OK` otherwise.
522
*
523
* _Note:_ It is wrong to use this when the input buffer represents only
524
* a segment of the input instead of the whole input. Use
525
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
526
*
527
* # Safety
528
*
529
* The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
530
* `aBytes` _must not_ alias the buffer (if any) of `aOut`.
531
*/
532
inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
                                         nsACString& aOut,
                                         size_t aAlreadyValidated) const {
  // Unpack the span into the pointer/length pair taken by the callee;
  // aAlreadyValidated is forwarded unchanged.
  const uint8_t* input = aBytes.Elements();
  const size_t inputLength = aBytes.Length();
  return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
      this, input, inputLength, &aOut, aAlreadyValidated);
}
538
539
/**
540
* Decode complete input to `nsAString` _without BOM handling_ and
541
* _with malformed sequences treated as fatal_ when the entire input is
542
* available as a single buffer (i.e. the end of the buffer marks the end
543
* of the stream).
544
*
545
* When invoked on `UTF_8`, this method implements the (non-streaming
546
* version of) the _UTF-8 decode without BOM or fail_
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
548
* spec concept.
549
*
550
* Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
551
* if a malformed sequence was encountered and `NS_OK` otherwise.
552
*
553
* _Note:_ It is wrong to use this when the input buffer represents only
554
* a segment of the input instead of the whole input. Use
555
* `NewDecoderWithoutBOMHandling()` when decoding segmented input.
556
*/
557
inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
    Span<const uint8_t> aBytes, nsAString& aOut) const {
  // Unpack the span into the pointer/length pair taken by the callee.
  const uint8_t* input = aBytes.Elements();
  const size_t inputLength = aBytes.Length();
  return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
      this, input, inputLength, &aOut);
}
562
563
/**
564
* Encode complete input to `nsACString` with unmappable characters
565
* replaced with decimal numeric character references when the entire input
566
* is available as a single buffer (i.e. the end of the buffer marks the
567
* end of the stream).
568
*
569
* This method implements the (non-streaming version of) the
570
* _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
571
*
572
* The second item in the returned tuple is the encoding that was actually
573
* used (which may differ from this encoding thanks to some encodings
574
* having UTF-8 as their output encoding).
575
*
576
* The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
577
* the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
578
* `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
579
* replaced with numeric character references) and `NS_OK` otherwise.
580
*
581
* The backing buffer of the string isn't copied if the input buffer
582
* is heap-allocated and encoding to UTF-8 and the input is valid
583
* UTF-8, encoding to an ASCII-compatible encoding and the input
584
* is valid ASCII or encoding to ISO-2022-JP and the input stays
585
* in the ASCII state of ISO-2022-JP. It is OK to pass the same string
586
* as both arguments.
587
*
588
* _Note:_ It is wrong to use this when the input buffer represents only
589
* a segment of the input instead of the whole input. Use `NewEncoder()`
590
* when encoding segmented output.
591
*/
592
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
    const nsACString& aString, nsACString& aOut) const {
  // Starts out as this encoding; passed by pointer so that the callee can
  // overwrite it with the encoding that was actually used.
  const Encoding* usedEncoding = this;
  nsresult rv;
  if (&aString != &aOut) {
    rv = mozilla_encoding_encode_from_nscstring(&usedEncoding, &aString,
                                                &aOut);
  } else {
    // Input and output are the same string object: encode from a copy of the
    // input instead.
    nsAutoCString inputCopy(aString);
    rv = mozilla_encoding_encode_from_nscstring(&usedEncoding, &inputCopy,
                                                &aOut);
  }
  return MakeTuple(rv, WrapNotNull(usedEncoding));
}
606
607
/**
608
* Encode complete input to `nsACString` with unmappable characters
609
* replaced with decimal numeric character references when the entire input
610
* is available as a single buffer (i.e. the end of the buffer marks the
611
* end of the stream).
612
*
613
* This method implements the (non-streaming version of) the
614
* _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
615
*
616
* The second item in the returned tuple is the encoding that was actually
617
* used (which may differ from this encoding thanks to some encodings
618
* having UTF-8 as their output encoding).
619
*
620
* The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
621
* OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
622
* were replaced with numeric character references) and `NS_OK` otherwise.
623
624
* _Note:_ It is wrong to use this when the input buffer represents only
625
* a segment of the input instead of the whole input. Use `NewEncoder()`
626
* when encoding segmented output.
627
*/
628
inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
    Span<const char16_t> aString, nsACString& aOut) const {
  // Starts out as this encoding; passed by pointer so that the callee can
  // overwrite it with the encoding that was actually used.
  const Encoding* usedEncoding = this;
  const char16_t* input = aString.Elements();
  const size_t inputLength = aString.Length();
  nsresult rv = mozilla_encoding_encode_from_utf16(&usedEncoding, input,
                                                   inputLength, &aOut);
  return MakeTuple(rv, WrapNotNull(usedEncoding));
}
635
636
/**
637
* Instantiates a new decoder for this encoding with BOM sniffing enabled.
638
*
639
* BOM sniffing may cause the returned decoder to morph into a decoder
640
* for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
641
*/
642
inline UniquePtr<Decoder> NewDecoder() const {
  // Wrap the pointer returned by encoding_new_decoder() in a UniquePtr.
  return UniquePtr<Decoder>(encoding_new_decoder(this));
}
646
647
/**
648
* Instantiates a new decoder for this encoding with BOM sniffing enabled
649
* into memory occupied by a previously-instantiated decoder.
650
*
651
* BOM sniffing may cause the returned decoder to morph into a decoder
652
* for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
653
*/
654
inline void NewDecoderInto(Decoder& aDecoder) const {
  // The callee takes the target decoder by pointer.
  Decoder* const target = &aDecoder;
  encoding_new_decoder_into(this, target);
}
657
658
/**
659
* Instantiates a new decoder for this encoding with BOM removal.
660
*
661
* If the input starts with bytes that are the BOM for this encoding,
662
* those bytes are removed. However, the decoder never morphs into a
663
* decoder for another encoding: A BOM for another encoding is treated as
664
* (potentially malformed) input to the decoding algorithm for this
665
* encoding.
666
*/
667
inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
  // Wrap the pointer returned by encoding_new_decoder_with_bom_removal() in
  // a UniquePtr.
  return UniquePtr<Decoder>(encoding_new_decoder_with_bom_removal(this));
}
671
672
/**
673
* Instantiates a new decoder for this encoding with BOM removal
674
* into memory occupied by a previously-instantiated decoder.
675
*
676
* If the input starts with bytes that are the BOM for this encoding,
677
* those bytes are removed. However, the decoder never morphs into a
678
* decoder for another encoding: A BOM for another encoding is treated as
679
* (potentially malformed) input to the decoding algorithm for this
680
* encoding.
681
*/
682
inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
  // The callee takes the target decoder by pointer.
  Decoder* const target = &aDecoder;
  encoding_new_decoder_with_bom_removal_into(this, target);
}
685
686
/**
687
* Instantiates a new decoder for this encoding with BOM handling disabled.
688
*
689
* If the input starts with bytes that look like a BOM, those bytes are
690
* not treated as a BOM. (Hence, the decoder never morphs into a decoder
691
* for another encoding.)
692
*
693
* _Note:_ If the caller has performed BOM sniffing on its own but has not
694
* removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
695
* instead of this method to cause the BOM to be removed.
696
*/
697
inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
  // Wrap the pointer returned by encoding_new_decoder_without_bom_handling()
  // in a UniquePtr.
  return UniquePtr<Decoder>(encoding_new_decoder_without_bom_handling(this));
}
701
702
/**
703
* Instantiates a new decoder for this encoding with BOM handling disabled
704
* into memory occupied by a previously-instantiated decoder.
705
*
706
* If the input starts with bytes that look like a BOM, those bytes are
707
* not treated as a BOM. (Hence, the decoder never morphs into a decoder
708
* for another encoding.)
709
*
710
* _Note:_ If the caller has performed BOM sniffing on its own but has not
711
* removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
712
* instead of this method to cause the BOM to be removed.
713
*/
714
inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
  // The callee takes the target decoder by pointer.
  Decoder* const target = &aDecoder;
  encoding_new_decoder_without_bom_handling_into(this, target);
}
717
718
/**
719
* Instantiates a new encoder for the output encoding of this encoding.
720
*/
721
inline UniquePtr<Encoder> NewEncoder() const {
  // Wrap the pointer returned by encoding_new_encoder() in a UniquePtr.
  return UniquePtr<Encoder>(encoding_new_encoder(this));
}
725
726
/**
727
* Instantiates a new encoder for the output encoding of this encoding
728
* into memory occupied by a previously-instantiated encoder.
729
*/
730
inline void NewEncoderInto(Encoder& aEncoder) const {
  // The callee takes the target encoder by pointer.
  Encoder* const target = &aEncoder;
  encoding_new_encoder_into(this, target);
}
733
734
/**
735
* Validates UTF-8.
736
*
737
* Returns the index of the first byte that makes the input malformed as
738
* UTF-8 or the length of the input if the input is entirely valid.
739
*/
740
static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
  // Unpack the span into the pointer/length pair taken by the callee.
  const uint8_t* bytes = aBuffer.Elements();
  const size_t byteCount = aBuffer.Length();
  return encoding_utf8_valid_up_to(bytes, byteCount);
}
743
744
/**
745
* Validates ASCII.
746
*
747
* Returns the index of the first byte that makes the input malformed as
748
* ASCII or the length of the input if the input is entirely valid.
749
*/
750
static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
  // Unpack the span into the pointer/length pair taken by the callee.
  const uint8_t* bytes = aBuffer.Elements();
  const size_t byteCount = aBuffer.Length();
  return encoding_ascii_valid_up_to(bytes, byteCount);
}
753
754
/**
755
* Validates ISO-2022-JP ASCII-state data.
756
*
757
* Returns the index of the first byte that makes the input not
758
* representable in the ASCII state of ISO-2022-JP or the length of the
759
* input if the input is entirely representable in the ASCII state of
760
* ISO-2022-JP.
761
*/
762
static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
  // Unpack the span into the (pointer, length) pair the FFI expects.
  const uint8_t* const bytes = aBuffer.Elements();
  const size_t byteCount = aBuffer.Length();
  return encoding_iso_2022_jp_ascii_valid_up_to(bytes, byteCount);
}
766
767
private:
768
Encoding() = delete;
769
Encoding(const Encoding&) = delete;
770
Encoding& operator=(const Encoding&) = delete;
771
~Encoding() = delete;
772
};
773
774
/**
775
* A converter that decodes a byte stream into Unicode according to a
776
* character encoding in a streaming (incremental) manner.
777
*
778
* The various `Decode*` methods take an input buffer (`aSrc`) and an output
779
* buffer `aDst` both of which are caller-allocated. There are variants for
780
* both UTF-8 and UTF-16 output buffers.
781
*
782
* A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
783
* into `aDst` until one of the following three things happens:
784
*
785
* 1. A malformed byte sequence is encountered (`*WithoutReplacement`
786
* variants only).
787
*
788
* 2. The output buffer has been filled so near capacity that the decoder
789
* cannot be sure that processing an additional byte of input wouldn't
790
* cause so much output that the output buffer would overflow.
791
*
792
* 3. All the input bytes have been processed.
793
*
794
* The `Decode*` method then returns a tuple of a status indicating which one
795
* of the three reasons to return happened, how many input bytes were read,
796
* how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
797
* when decoding to UTF-16) were written, and in the case of the
798
* variants performing replacement, a boolean indicating whether an error was
799
* replaced with the REPLACEMENT CHARACTER during the call.
800
*
801
* The number of code units "written" is what's logically written. Garbage may be
802
* written in the output buffer beyond the point logically written to.
803
*
804
* In the case of the `*WithoutReplacement` variants, the status is a
805
* `uint32_t` whose possible values are packed info about a malformed byte
806
* sequence, `kOutputFull` and `kInputEmpty` corresponding to the three cases
807
* listed above.
808
*
809
* Packed info about malformed sequences has the following format:
810
* The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
811
* indicate the number of bytes that were consumed after the malformed
812
* sequence, and the next-lowest 8 bits, when shifted right by 8, indicate
813
* the length of the malformed byte sequence (possible decimal values 1, 2,
814
* 3 or 4). The maximum possible sum of the two is 6.
815
*
816
* In the case of methods whose name does not end with
817
* `*WithoutReplacement`, malformed sequences are automatically replaced
818
* with the REPLACEMENT CHARACTER and errors do not cause the methods to
819
* return early.
820
*
821
* When decoding to UTF-8, the output buffer must have at least 4 bytes of
822
* space. When decoding to UTF-16, the output buffer must have at least two
823
* UTF-16 code units (`char16_t`) of space.
824
*
825
* When decoding to UTF-8 without replacement, the methods are guaranteed
826
* not to return indicating that more output space is needed if the length
827
* of the output buffer is at least the length returned by
828
* `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
829
* with replacement, the length of the output buffer that guarantees the
830
* methods not to return indicating that more output space is needed is given
831
* by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
832
* or without replacement, the length of the output buffer that guarantees
833
* the methods not to return indicating that more output space is needed is
834
* given by `MaxUTF16BufferLength()`.
835
*
836
* The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
837
* and the output after each `Decode*` call is guaranteed to consist of
838
* complete characters. (I.e. the code unit sequence for the last character is
839
* guaranteed not to be split across output buffers.)
840
*
841
* The boolean argument `aLast` indicates that the end of the stream is reached
842
* when all the bytes in `aSrc` have been consumed.
843
*
844
* A `Decoder` object can be used to incrementally decode a byte stream.
845
*
846
* During the processing of a single stream, the caller must call `Decode*`
847
* zero or more times with `aLast` set to `false` and then call `Decode*` at
848
* least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
849
* the processing of the stream has ended. Otherwise, the caller must call
850
* `Decode*` again with `aLast` set to `true` (or treat a malformed result,
851
* i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
852
*
853
* Once the stream has ended, the `Decoder` object must not be used anymore.
854
* That is, you need to create another one to process another stream.
855
*
856
* When the decoder returns `kOutputFull` or the decoder returns a malformed
857
* result and the caller does not wish to treat it as a fatal error, the input
858
* buffer `aSrc` may not have been completely consumed. In that case, the caller
859
* must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
860
* call.
861
*
862
* # Infinite loops
863
*
864
* When converting with a fixed-size output buffer whose size is too small to
865
* accommodate one character of output, an infinite loop ensues. When
866
* converting with a fixed-size output buffer, it generally makes sense to
867
* make the buffer fairly large (e.g. couple of kilobytes).
868
*/
869
class Decoder final {
870
public:
871
~Decoder() {}
872
static void operator delete(void* aDecoder) {
873
decoder_free(reinterpret_cast<Decoder*>(aDecoder));
874
}
875
876
/**
877
* The `Encoding` this `Decoder` is for.
878
*
879
* BOM sniffing can change the return value of this method during the life
880
* of the decoder.
881
*/
882
inline NotNull<const mozilla::Encoding*> Encoding() const {
883
return WrapNotNull(decoder_encoding(this));
884
}
885
886
/**
887
* Query the worst-case UTF-8 output size _with replacement_.
888
*
889
* Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
890
* that will not overflow given the current state of the decoder and
891
* `aByteLength` number of additional input bytes when decoding with
892
* errors handled by outputting a REPLACEMENT CHARACTER for each malformed
893
* sequence.
894
*/
895
inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
896
CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
897
if (max.value() == MaxValue<size_t>::value) {
898
// Mark invalid by overflowing
899
max++;
900
MOZ_ASSERT(!max.isValid());
901
}
902
return max;
903
}
904
905
/**
906
* Query the worst-case UTF-8 output size _without replacement_.
907
*
908
* Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
909
* that will not overflow given the current state of the decoder and
910
* `aByteLength` number of additional input bytes when decoding without
911
* replacement error handling.
912
*
913
* Note that this value may be too small for the `WithReplacement` case.
914
* Use `MaxUTF8BufferLength()` for that case.
915
*/
916
inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
917
size_t aByteLength) const {
918
CheckedInt<size_t> max(
919
decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
920
if (max.value() == MaxValue<size_t>::value) {
921
// Mark invalid by overflowing
922
max++;
923
MOZ_ASSERT(!max.isValid());
924
}
925
return max;
926
}
927
928
/**
929
* Incrementally decode a byte stream into UTF-8 with malformed sequences
930
* replaced with the REPLACEMENT CHARACTER.
931
*
932
* See the documentation of the class for documentation for `Decode*`
933
* methods collectively.
934
*/
935
inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
936
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
937
size_t srcRead = aSrc.Length();
938
size_t dstWritten = aDst.Length();
939
bool hadReplacements;
940
uint32_t result =
941
decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
942
&dstWritten, aLast, &hadReplacements);
943
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
944
}
945
946
/**
947
* Incrementally decode a byte stream into UTF-8 _without replacement_.
948
*
949
* See the documentation of the class for documentation for `Decode*`
950
* methods collectively.
951
*/
952
inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
953
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
954
size_t srcRead = aSrc.Length();
955
size_t dstWritten = aDst.Length();
956
uint32_t result = decoder_decode_to_utf8_without_replacement(
957
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
958
return MakeTuple(result, srcRead, dstWritten);
959
}
960
961
/**
962
* Query the worst-case UTF-16 output size (with or without replacement).
963
*
964
* Returns the size of the output buffer in UTF-16 code units (`char16_t`)
965
* that will not overflow given the current state of the decoder and
966
* `aByteLength` number of additional input bytes.
967
*
968
* Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
969
* return value of this method applies also in the
970
* `_without_replacement` case.
971
*/
972
inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
973
CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
974
if (max.value() == MaxValue<size_t>::value) {
975
// Mark invalid by overflowing
976
max++;
977
MOZ_ASSERT(!max.isValid());
978
}
979
return max;
980
}
981
982
/**
983
* Incrementally decode a byte stream into UTF-16 with malformed sequences
984
* replaced with the REPLACEMENT CHARACTER.
985
*
986
* See the documentation of the class for documentation for `Decode*`
987
* methods collectively.
988
*/
989
inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
990
Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
991
size_t srcRead = aSrc.Length();
992
size_t dstWritten = aDst.Length();
993
bool hadReplacements;
994
uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
995
aDst.Elements(), &dstWritten,
996
aLast, &hadReplacements);
997
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
998
}
999
1000
/**
1001
* Incrementally decode a byte stream into UTF-16 _without replacement_.
1002
*
1003
* See the documentation of the class for documentation for `Decode*`
1004
* methods collectively.
1005
*/
1006
inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1007
Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1008
size_t srcRead = aSrc.Length();
1009
size_t dstWritten = aDst.Length();
1010
uint32_t result = decoder_decode_to_utf16_without_replacement(
1011
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1012
return MakeTuple(result, srcRead, dstWritten);
1013
}
1014
1015
private:
1016
Decoder() = delete;
1017
Decoder(const Decoder&) = delete;
1018
Decoder& operator=(const Decoder&) = delete;
1019
};
1020
1021
/**
1022
* A converter that encodes a Unicode stream into bytes according to a
1023
* character encoding in a streaming (incremental) manner.
1024
*
1025
* The various `Encode*` methods take an input buffer (`aSrc`) and an output
1026
* buffer `aDst` both of which are caller-allocated. There are variants for
1027
* both UTF-8 and UTF-16 input buffers.
1028
*
1029
* An `Encode*` method encodes characters from `aSrc` into bytes
1030
* stored into `aDst` until one of the following three things happens:
1031
*
1032
* 1. An unmappable character is encountered (`*WithoutReplacement` variants
1033
* only).
1034
*
1035
* 2. The output buffer has been filled so near capacity that the encoder
1036
* cannot be sure that processing an additional character of input wouldn't
1037
* cause so much output that the output buffer would overflow.
1038
*
1039
* 3. All the input characters have been processed.
1040
*
1041
* The `Encode*` method then returns a tuple of a status indicating which one
1042
* of the three reasons to return happened, how many input code units (`uint8_t`
1043
* when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1044
* how many output bytes were written, and in the case of the variants that
1045
* perform replacement, a boolean indicating whether an unmappable
1046
* character was replaced with a numeric character reference during the call.
1047
*
1048
* The number of bytes "written" is what's logically written. Garbage may be
1049
* written in the output buffer beyond the point logically written to.
1050
*
1051
* In the case of the methods whose name ends with
1052
* `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1053
* are an unmappable code point, `kOutputFull` and `kInputEmpty` corresponding
1054
* to the three cases listed above.
1055
*
1056
* In the case of methods whose name does not end with
1057
* `*WithoutReplacement`, unmappable characters are automatically replaced
1058
* with the corresponding numeric character references and unmappable
1059
* characters do not cause the methods to return early.
1060
*
1061
* When encoding from UTF-8 without replacement, the methods are guaranteed
1062
* not to return indicating that more output space is needed if the length
1063
* of the output buffer is at least the length returned by
1064
* `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1065
* UTF-8 with replacement, the length of the output buffer that guarantees the
1066
* methods not to return indicating that more output space is needed in the
1067
* absence of unmappable characters is given by
1068
* `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1069
* UTF-16 without replacement, the methods are guaranteed not to return
1070
* indicating that more output space is needed if the length of the output
1071
* buffer is at least the length returned by
1072
* `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1073
* from UTF-16 with replacement, the length of the output buffer that
1074
* guarantees the methods not to return indicating that more output space is
1075
* needed in the absence of unmappable characters is given by
1076
* `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1077
* When encoding with replacement, applications are not expected to size the
1078
* buffer for the worst case ahead of time but to resize the buffer if there
1079
* are unmappable characters. This is why max length queries are only available
1080
* for the case where there are no unmappable characters.
1081
*
1082
* When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1083
* encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1084
* REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1085
* turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1086
* surrogate pairs are not split across input buffer boundaries.
1087
*
1088
* After an `Encode*` call returns, the output produced so far, taken as a
1089
* whole from the start of the stream, is guaranteed to consist of a valid
1090
* byte sequence in the target encoding. (I.e. the code unit sequence for a
1091
* character is guaranteed not to be split across output buffers. However, due
1092
* to the stateful nature of ISO-2022-JP, the stream needs to be considered
1093
* from the start for it to be valid. For other encodings, the validity holds
1094
* on a per-output buffer basis.)
1095
*
1096
* The boolean argument `aLast` indicates that the end of the stream is reached
1097
* when all the characters in `aSrc` have been consumed. This argument is needed
1098
* for ISO-2022-JP and is ignored for other encodings.
1099
*
1100
* An `Encoder` object can be used to incrementally encode a Unicode stream.
1101
*
1102
* During the processing of a single stream, the caller must call `Encode*`
1103
* zero or more times with `aLast` set to `false` and then call `Encode*` at
1104
* least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1105
* the processing of the stream has ended. Otherwise, the caller must call
1106
* `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1107
* i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1108
*
1109
* Once the stream has ended, the `Encoder` object must not be used anymore.
1110
* That is, you need to create another one to process another stream.
1111
*
1112
* When the encoder returns `kOutputFull` or the encoder returns an unmappable
1113
* result and the caller does not wish to treat it as a fatal error, the input
1114
* buffer `aSrc` may not have been completely consumed. In that case, the caller
1115
* must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1116
* call.
1117
*
1118
* # Infinite loops
1119
*
1120
* When converting with a fixed-size output buffer whose size is too small to
1121
* accommodate one character of output, an infinite loop ensues. When
1122
* converting with a fixed-size output buffer, it generally makes sense to
1123
* make the buffer fairly large (e.g. couple of kilobytes).
1124
*/
1125
class Encoder final {
1126
public:
1127
~Encoder() {}
1128
1129
static void operator delete(void* aEncoder) {
1130
encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1131
}
1132
1133
/**
1134
* The `Encoding` this `Encoder` is for.
1135
*/
1136
inline NotNull<const mozilla::Encoding*> Encoding() const {
1137
return WrapNotNull(encoder_encoding(this));
1138
}
1139
1140
/**
1141
* Returns `true` if this is an ISO-2022-JP encoder that's not in the
1142
* ASCII state and `false` otherwise.
1143
*/
1144
inline bool HasPendingState() const {
1145
return encoder_has_pending_state(this);
1146
}
1147
1148
/**
1149
* Query the worst-case output size when encoding from UTF-8 with
1150
* replacement.
1151
*
1152
* Returns the size of the output buffer in bytes that will not overflow
1153
* given the current state of the encoder and `aByteLength` number of
1154
* additional input code units if there are no unmappable characters in
1155
* the input.
1156
*/
1157
inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1158
size_t aByteLength) const {
1159
CheckedInt<size_t> max(
1160
encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1161
aByteLength));
1162
if (max.value() == MaxValue<size_t>::value) {
1163
// Mark invalid by overflowing
1164
max++;
1165
MOZ_ASSERT(!max.isValid());
1166
}
1167
return max;
1168
}
1169
1170
/**
1171
* Query the worst-case output size when encoding from UTF-8 without
1172
* replacement.
1173
*
1174
* Returns the size of the output buffer in bytes that will not overflow
1175
* given the current state of the encoder and `aByteLength` number of
1176
* additional input code units.
1177
*/
1178
inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1179
size_t aByteLength) const {
1180
CheckedInt<size_t> max(
1181
encoder_max_buffer_length_from_utf8_without_replacement(this,
1182
aByteLength));
1183
if (max.value() == MaxValue<size_t>::value) {
1184
// Mark invalid by overflowing
1185
max++;
1186
MOZ_ASSERT(!max.isValid());
1187
}
1188
return max;
1189
}
1190
1191
/**
1192
* Incrementally encode into byte stream from UTF-8 with unmappable
1193
* characters replaced with HTML (decimal) numeric character references.
1194
*
1195
* See the documentation of the class for documentation for `Encode*`
1196
* methods collectively.
1197
*
1198
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1199
* The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1200
* absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1201
*/
1202
inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1203
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1204
size_t srcRead = aSrc.Length();
1205
size_t dstWritten = aDst.Length();
1206
bool hadReplacements;
1207
uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1208
aDst.Elements(), &dstWritten,
1209
aLast, &hadReplacements);
1210
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1211
}
1212
1213
/**
1214
* Incrementally encode into byte stream from UTF-8 _without replacement_.
1215
*
1216
* See the documentation of the class for documentation for `Encode*`
1217
* methods collectively.
1218
*
1219
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1220
* The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1221
* absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1222
*/
1223
inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1224
Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1225
size_t srcRead = aSrc.Length();
1226
size_t dstWritten = aDst.Length();
1227
uint32_t result = encoder_encode_from_utf8_without_replacement(
1228
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1229
return MakeTuple(result, srcRead, dstWritten);
1230
}
1231
1232
/**
1233
* Query the worst-case output size when encoding from UTF-16 with
1234
* replacement.
1235
*
1236
* Returns the size of the output buffer in bytes that will not overflow
1237
* given the current state of the encoder and `aU16Length` number of
1238
* additional input code units if there are no unmappable characters in
1239
* the input.
1240
*/
1241
inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1242
size_t aU16Length) const {
1243
CheckedInt<size_t> max(
1244
encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1245
aU16Length));
1246
if (max.value() == MaxValue<size_t>::value) {
1247
// Mark invalid by overflowing
1248
max++;
1249
MOZ_ASSERT(!max.isValid());
1250
}
1251
return max;
1252
}
1253
1254
/**
1255
* Query the worst-case output size when encoding from UTF-16 without
1256
* replacement.
1257
*
1258
* Returns the size of the output buffer in bytes that will not overflow
1259
* given the current state of the encoder and `aU16Length` number of
1260
* additional input code units.
1261
*/
1262
inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1263
size_t aU16Length) const {
1264
CheckedInt<size_t> max(
1265
encoder_max_buffer_length_from_utf16_without_replacement(this,
1266
aU16Length));
1267
if (max.value() == MaxValue<size_t>::value) {
1268
// Mark invalid by overflowing
1269
max++;
1270
MOZ_ASSERT(!max.isValid());
1271
}
1272
return max;
1273
}
1274
1275
/**
1276
* Incrementally encode into byte stream from UTF-16 with unmappable
1277
* characters replaced with HTML (decimal) numeric character references.
1278
*
1279
* See the documentation of the class for documentation for `Encode*`
1280
* methods collectively.
1281
*/
1282
inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1283
Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1284
size_t srcRead = aSrc.Length();
1285
size_t dstWritten = aDst.Length();
1286
bool hadReplacements;
1287
uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1288
aDst.Elements(), &dstWritten,
1289
aLast, &hadReplacements);
1290
return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1291
}
1292
1293
/**
1294
* Incrementally encode into byte stream from UTF-16 _without replacement_.
1295
*
1296
* See the documentation of the class for documentation for `Encode*`
1297
* methods collectively.
1298
*/
1299
inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1300
Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1301
size_t srcRead = aSrc.Length();
1302
size_t dstWritten = aDst.Length();
1303
uint32_t result = encoder_encode_from_utf16_without_replacement(
1304
this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1305
return MakeTuple(result, srcRead, dstWritten);
1306
}
1307
1308
private:
1309
Encoder() = delete;
1310
Encoder(const Encoder&) = delete;
1311
Encoder& operator=(const Encoder&) = delete;
1312
};
1313
1314
}; // namespace mozilla
1315
1316
#endif // mozilla_Encoding_h