// camellia_hwaes.cpp
// Path: comm-central/third_party/botan/src/lib/block/camellia/camellia_hwaes/camellia_hwaes.cpp

/*

* (C) 2026 Jack Lloyd

* Botan is released under the Simplified BSD License (see license.txt)

*/

#include <botan/internal/camellia.h>

#include <botan/mem_ops.h>

#include <botan/internal/simd_hwaes.h>

namespace Botan {

namespace Camellia_HWAES {

namespace {

/* Helpers for 64-bit operations on SIMD_4x32 */

/*
* Load 16 bytes from `in` and reverse the byte order inside each 8-byte
* half, so that the register holds two 64-bit values that were stored
* big-endian in memory.
*
* NOTE(review): assumes byte_shuffle picks source bytes by index
* (pshufb-style) - confirm against SIMD_4x32.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 load_be64(const uint8_t* in) {
   // Index table: byte reversal within each 64-bit half
   const auto bswap64 = SIMD_4x32(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B);
   return SIMD_4x32::byte_shuffle(SIMD_4x32::load_le(in), bswap64);
}

/*
* Inverse of load_be64: reverse the byte order inside each 8-byte half of
* `v` and write the resulting 16 bytes to `out`.
*
* The same shuffle table is used in both directions because a byte
* reversal is its own inverse.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void store_be64(uint8_t* out, SIMD_4x32 v) {
   // Index table: byte reversal within each 64-bit half
   const auto bswap64 = SIMD_4x32(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B);
   SIMD_4x32::byte_shuffle(v, bswap64).store_le(out);
}

/*
* Replicate the 64-bit value `v` into both 64-bit halves of a SIMD_4x32.
*
* The value is split into its low and high 32-bit words, which are placed
* as (lo, hi, lo, hi) in the four 32-bit elements.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 splat64(uint64_t v) {
   const uint32_t lo = static_cast<uint32_t>(v);
   const uint32_t hi = static_cast<uint32_t>(v >> 32);
   return SIMD_4x32(lo, hi, lo, hi);
}

/*
* The Camellia round function, applied to both 64-bit halves of `x`.
*
* Every byte goes through one of the four Camellia S-boxes (S1..S4). Each
* S-box is evaluated as: a pre-affine transform, the hardware AES S-box
* (hw_aes_sbox), then a post-affine transform. S1/S2/S3 share one
* pre-affine transform and S4 has its own; S1 and S4 share one post-affine
* transform while S2 and S3 each have their own. Byte masks select which
* variant is kept at each byte position.
*
* The substituted bytes are then mixed by XORing six byte-shuffled copies
* of the S-box output.
*
* The caller is expected to have XORed the round key into `x` already.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 camellia_f(SIMD_4x32 x) {
   // Pre-affine shared by S1/S2/S3
   constexpr uint64_t pre123_a = gfni_matrix(R"(

      1 1 1 0 1 1 0 1

      0 0 1 1 0 0 1 0

      1 1 0 1 0 0 0 0

      1 0 1 1 0 0 1 1

      0 0 0 0 1 1 0 0

      1 0 1 0 0 1 0 0

      0 0 1 0 1 1 0 0

      1 0 0 0 0 1 1 0)");

   // Pre-affine for S4
   constexpr uint64_t pre4_a = gfni_matrix(R"(

      1 1 0 1 1 0 1 1

      0 1 1 0 0 1 0 0

      1 0 1 0 0 0 0 1

      0 1 1 0 0 1 1 1

      0 0 0 1 1 0 0 0

      0 1 0 0 1 0 0 1

      0 1 0 1 1 0 0 0

      0 0 0 0 1 1 0 1)");

   // Additive constant used by both pre-affine transforms
   constexpr uint8_t pre_c = 0x45;

   // Post-affine for S1 and S4
   constexpr uint64_t post14_a = gfni_matrix(R"(

      0 0 0 0 0 0 0 1

      0 1 1 0 0 1 1 0

      1 0 1 1 1 1 1 0

      0 0 0 1 1 0 1 1

      1 0 0 0 1 1 1 0

      0 1 0 1 1 1 1 0

      0 1 1 1 1 1 1 1

      0 0 0 1 1 1 0 0)");

   constexpr uint8_t post14_c = 0x6E;

   // Post-affine for S2
   constexpr uint64_t post2_a = gfni_matrix(R"(

      0 0 0 1 1 1 0 0

      0 0 0 0 0 0 0 1

      0 1 1 0 0 1 1 0

      1 0 1 1 1 1 1 0

      0 0 0 1 1 0 1 1

      1 0 0 0 1 1 1 0

      0 1 0 1 1 1 1 0

      0 1 1 1 1 1 1 1)");

   constexpr uint8_t post2_c = 0xDC;

   // Post-affine for S3
   constexpr uint64_t post3_a = gfni_matrix(R"(

      0 1 1 0 0 1 1 0

      1 0 1 1 1 1 1 0

      0 0 0 1 1 0 1 1

      1 0 0 0 1 1 1 0

      0 1 0 1 1 1 1 0

      0 1 1 1 1 1 1 1

      0 0 0 1 1 1 0 0

      0 0 0 0 0 0 0 1)");

   constexpr uint8_t post3_c = 0x37;

   constexpr auto PRE123 = Gf2AffineTransformation(pre123_a, pre_c);
   constexpr auto PRE4 = Gf2AffineTransformation(pre4_a, pre_c);
   constexpr auto POST14 = Gf2AffineTransformation::post_sbox(post14_a, post14_c);
   constexpr auto POST2 = Gf2AffineTransformation::post_sbox(post2_a, post2_c);
   constexpr auto POST3 = Gf2AffineTransformation::post_sbox(post3_a, post3_c);

   // Byte positions (within each 64-bit half) that use S2, S3 and S4.
   // Positions not covered by any of these masks use S1.
   const auto mask_s2 = SIMD_4x32(0xFF000000, 0x00FF0000, 0xFF000000, 0x00FF0000);
   const auto mask_s3 = SIMD_4x32(0x00FF0000, 0x0000FF00, 0x00FF0000, 0x0000FF00);
   const auto mask_s4 = SIMD_4x32(0x0000FF00, 0x000000FF, 0x0000FF00, 0x000000FF);

   const auto pre123 = PRE123.affine_transform(x);
   const auto pre4 = PRE4.affine_transform(x);

   // One S-box pass over the whole register: S4 positions take the S4
   // pre-affine result, all other positions take the shared one
   const auto sub = hw_aes_sbox(SIMD_4x32::byte_blend(mask_s4, pre4, pre123));

   const auto s14 = POST14.affine_transform(sub);
   const auto s2 = POST2.affine_transform(sub);
   const auto s3 = POST3.affine_transform(sub);

   // Final merged Sbox output for all bytes
   const auto sbox = SIMD_4x32::byte_blend(mask_s3, s3, SIMD_4x32::byte_blend(mask_s2, s2, s14));

   // The linear mixing step
   const auto P1 = SIMD_4x32(0x00000001, 0x00000001, 0x08080809, 0x08080809);
   const auto P2 = SIMD_4x32(0x01010202, 0x01010202, 0x09090A0A, 0x09090A0A);
   const auto P3 = SIMD_4x32(0x02030303, 0x02030303, 0x0A0B0B0B, 0x0A0B0B0B);
   const auto P4 = SIMD_4x32(0x06050404, 0x04040504, 0x0E0D0C0C, 0x0C0C0D0C);
   const auto P5 = SIMD_4x32(0x07060507, 0x05060605, 0x0F0E0D0F, 0x0D0E0E0D);
   // NOTE(review): the 0xFF indices presumably yield zero bytes, so those
   // output positions get no contribution from this term - confirm against
   // byte_shuffle
   const auto P6 = SIMD_4x32(0xFFFFFFFF, 0x07070706, 0xFFFFFFFF, 0x0F0F0F0E);

   const auto sxp1 = SIMD_4x32::byte_shuffle(sbox, P1);
   const auto sxp2 = SIMD_4x32::byte_shuffle(sbox, P2);
   const auto sxp3 = SIMD_4x32::byte_shuffle(sbox, P3);
   const auto sxp4 = SIMD_4x32::byte_shuffle(sbox, P4);
   const auto sxp5 = SIMD_4x32::byte_shuffle(sbox, P5);
   const auto sxp6 = SIMD_4x32::byte_shuffle(sbox, P6);

   return (sxp1 ^ sxp2 ^ sxp3 ^ sxp4 ^ sxp5 ^ sxp6);
}

/*

* FL and FL-inverse operate on 32-bit sub-halves within each 64-bit element.

* We use byte_shuffle to broadcast each 32-bit half, then recombine with byte_blend.

*/

/*
* Camellia FL function applied to both 64-bit elements of `v` using the
* 64-bit subkey `K`.
*
* With x1/x2 the upper/lower 32 bits of an element and k1/k2 the
* upper/lower 32 bits of K:
*    x2 ^= rotl<1>(x1 & k1)
*    x1 ^= (x2 | k2)
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FL_2(SIMD_4x32 v, uint64_t K) {
   const uint32_t k1 = static_cast<uint32_t>(K >> 32);
   const uint32_t k2 = static_cast<uint32_t>(K);

   // Broadcast upper/lower 32-bit halves of each 64-bit element
   const auto shuf_hi = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C);
   const auto shuf_lo = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908);

   auto x1 = SIMD_4x32::byte_shuffle(v, shuf_hi);
   auto x2 = SIMD_4x32::byte_shuffle(v, shuf_lo);

   // Order matters: the x1 update uses the already-updated x2
   x2 ^= (x1 & SIMD_4x32::splat(k1)).rotl<1>();
   x1 ^= x2 | SIMD_4x32::splat(k2);

   // Recombine: lo from x2, hi from x1
   const auto mask_hi = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
   return SIMD_4x32::byte_blend(mask_hi, x1, x2);
}

/*
* Camellia FL-inverse function applied to both 64-bit elements of `v`
* using the 64-bit subkey `K`.
*
* With x1/x2 the upper/lower 32 bits of an element and k1/k2 the
* upper/lower 32 bits of K:
*    x1 ^= (x2 | k2)
*    x2 ^= rotl<1>(x1 & k1)
* i.e. the same two steps as FL_2, performed in the opposite order.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FLINV_2(SIMD_4x32 v, uint64_t K) {
   const uint32_t k1 = static_cast<uint32_t>(K >> 32);
   const uint32_t k2 = static_cast<uint32_t>(K);

   // Broadcast upper/lower 32-bit halves of each 64-bit element
   const auto shuf_hi = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C);
   const auto shuf_lo = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908);

   auto x1 = SIMD_4x32::byte_shuffle(v, shuf_hi);
   auto x2 = SIMD_4x32::byte_shuffle(v, shuf_lo);

   // Order matters: the x2 update uses the already-updated x1
   x1 ^= x2 | SIMD_4x32::splat(k2);
   x2 ^= (x1 & SIMD_4x32::splat(k1)).rotl<1>();

   // Recombine: lo from x2, hi from x1
   const auto mask_hi = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
   return SIMD_4x32::byte_blend(mask_hi, x1, x2);
}

/*
* Load two consecutive 16-byte blocks from `in` (32 bytes are read) and
* regroup them so that `L` holds the left 64-bit halves of both blocks
* and `R` holds the right 64-bit halves of both blocks.
*
* @param in input, two blocks back to back
* @param L  set to [L0, L1]
* @param R  set to [R0, R1]
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void load_and_deinterleave(const uint8_t in[], SIMD_4x32& L, SIMD_4x32& R) {
   auto A = load_be64(in);       // block 0: [L0, R0]
   auto B = load_be64(in + 16);  // block 1: [L1, R1]

   // Selects the upper 64 bits of the first blend operand
   const auto mask_upper = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF);

   L = SIMD_4x32::byte_blend(mask_upper, B.swap_halves(), A);  // [L0, L1]
   R = SIMD_4x32::byte_blend(mask_upper, B, A.swap_halves());  // [R0, R1]
}

/*
* Inverse of load_and_deinterleave, with the two halves of each block
* exchanged: regroup L = [L0, L1] and R = [R0, R1] into per-block
* registers [R0, L0] and [R1, L1] and write them to `out` as two
* consecutive 16-byte blocks (32 bytes are written).
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void interleave_and_store(uint8_t out[], SIMD_4x32 L, SIMD_4x32 R) {
   // Camellia output swaps L and R

   // Selects the upper 64 bits of the first blend operand
   const auto mask_upper = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF);

   auto A = SIMD_4x32::byte_blend(mask_upper, L.swap_halves(), R);  // [R0, L0]
   auto B = SIMD_4x32::byte_blend(mask_upper, L, R.swap_halves());  // [R1, L1]

   store_be64(out, A);
   store_be64(out + 16, B);
}

/*
* Six Feistel rounds in the encryption direction.
*
* Consumes SK[0] .. SK[5] in ascending order; even-indexed subkeys update
* R from L and odd-indexed subkeys update L from R. `SK` must point at
* six or more readable subkeys.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_e_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) {
   R ^= camellia_f(L ^ splat64(SK[0]));
   L ^= camellia_f(R ^ splat64(SK[1]));
   R ^= camellia_f(L ^ splat64(SK[2]));
   L ^= camellia_f(R ^ splat64(SK[3]));
   R ^= camellia_f(L ^ splat64(SK[4]));
   L ^= camellia_f(R ^ splat64(SK[5]));
}

/*
* Six Feistel rounds in the decryption direction.
*
* Consumes the same six subkeys as six_e_rounds but in descending order,
* SK[5] .. SK[0]. `SK` must point at six or more readable subkeys.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_d_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) {
   R ^= camellia_f(L ^ splat64(SK[5]));
   L ^= camellia_f(R ^ splat64(SK[4]));
   R ^= camellia_f(L ^ splat64(SK[3]));
   L ^= camellia_f(R ^ splat64(SK[2]));
   R ^= camellia_f(L ^ splat64(SK[1]));
   L ^= camellia_f(R ^ splat64(SK[0]));
}

/*
* Encrypt two 16-byte blocks in parallel with the 18-round schedule.
*
* Subkey usage:
*    SK[0], SK[1]    input whitening
*    SK[2..7]        rounds 1-6
*    SK[8], SK[9]    FL / FL-inverse
*    SK[10..15]      rounds 7-12
*    SK[16], SK[17]  FL / FL-inverse
*    SK[18..23]      rounds 13-18
*    SK[24], SK[25]  output whitening
*
* @param in  32 bytes of input (two blocks)
* @param out 32 bytes of output (two blocks)
* @param SK  subkeys; indices 0..25 are read, so at least 26 entries
*/
BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 L;
   SIMD_4x32 R;
   load_and_deinterleave(in, L, R);

   L ^= splat64(SK[0]);
   R ^= splat64(SK[1]);

   six_e_rounds(L, R, &SK[2]);

   L = FL_2(L, SK[8]);
   R = FLINV_2(R, SK[9]);

   six_e_rounds(L, R, &SK[10]);

   L = FL_2(L, SK[16]);
   R = FLINV_2(R, SK[17]);

   six_e_rounds(L, R, &SK[18]);

   R ^= splat64(SK[24]);
   L ^= splat64(SK[25]);

   interleave_and_store(out, L, R);
}

/*
* Decrypt two 16-byte blocks in parallel with the 18-round schedule.
*
* Walks the same subkey array as camellia_encrypt_x2_18r from the end
* towards the start: whitening with SK[25]/SK[24], rounds over
* SK[18..23], FL/FL-inverse with SK[17]/SK[16], rounds over SK[10..15],
* FL/FL-inverse with SK[9]/SK[8], rounds over SK[2..7], and final
* whitening with SK[1]/SK[0].
*
* @param in  32 bytes of input (two blocks)
* @param out 32 bytes of output (two blocks)
* @param SK  subkeys; indices 0..25 are read, so at least 26 entries
*/
BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 L;
   SIMD_4x32 R;
   load_and_deinterleave(in, L, R);

   R ^= splat64(SK[25]);
   L ^= splat64(SK[24]);

   six_d_rounds(L, R, &SK[18]);

   L = FL_2(L, SK[17]);
   R = FLINV_2(R, SK[16]);

   six_d_rounds(L, R, &SK[10]);

   L = FL_2(L, SK[9]);
   R = FLINV_2(R, SK[8]);

   six_d_rounds(L, R, &SK[2]);

   L ^= splat64(SK[1]);
   R ^= splat64(SK[0]);

   interleave_and_store(out, L, R);
}

/*
* Encrypt two 16-byte blocks in parallel with the 24-round schedule.
*
* Subkey usage:
*    SK[0], SK[1]    input whitening
*    SK[2..7]        rounds 1-6
*    SK[8], SK[9]    FL / FL-inverse
*    SK[10..15]      rounds 7-12
*    SK[16], SK[17]  FL / FL-inverse
*    SK[18..23]      rounds 13-18
*    SK[24], SK[25]  FL / FL-inverse
*    SK[26..31]      rounds 19-24
*    SK[32], SK[33]  output whitening
*
* @param in  32 bytes of input (two blocks)
* @param out 32 bytes of output (two blocks)
* @param SK  subkeys; indices 0..33 are read, so at least 34 entries
*/
BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 L;
   SIMD_4x32 R;
   load_and_deinterleave(in, L, R);

   L ^= splat64(SK[0]);
   R ^= splat64(SK[1]);

   six_e_rounds(L, R, &SK[2]);

   L = FL_2(L, SK[8]);
   R = FLINV_2(R, SK[9]);

   six_e_rounds(L, R, &SK[10]);

   L = FL_2(L, SK[16]);
   R = FLINV_2(R, SK[17]);

   six_e_rounds(L, R, &SK[18]);

   L = FL_2(L, SK[24]);
   R = FLINV_2(R, SK[25]);

   six_e_rounds(L, R, &SK[26]);

   R ^= splat64(SK[32]);
   L ^= splat64(SK[33]);

   interleave_and_store(out, L, R);
}

/*
* Decrypt two 16-byte blocks in parallel with the 24-round schedule.
*
* Walks the same subkey array as camellia_encrypt_x2_24r from the end
* towards the start: whitening with SK[33]/SK[32], then four groups of
* six rounds (SK[26..31], SK[18..23], SK[10..15], SK[2..7]) separated by
* FL/FL-inverse layers (SK[25]/SK[24], SK[17]/SK[16], SK[9]/SK[8]), and
* final whitening with SK[1]/SK[0].
*
* @param in  32 bytes of input (two blocks)
* @param out 32 bytes of output (two blocks)
* @param SK  subkeys; indices 0..33 are read, so at least 34 entries
*/
BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 L;
   SIMD_4x32 R;
   load_and_deinterleave(in, L, R);

   R ^= splat64(SK[33]);
   L ^= splat64(SK[32]);

   six_d_rounds(L, R, &SK[26]);

   L = FL_2(L, SK[25]);
   R = FLINV_2(R, SK[24]);

   six_d_rounds(L, R, &SK[18]);

   L = FL_2(L, SK[17]);
   R = FLINV_2(R, SK[16]);

   six_d_rounds(L, R, &SK[10]);

   L = FL_2(L, SK[9]);
   R = FLINV_2(R, SK[8]);

   six_d_rounds(L, R, &SK[2]);

   L ^= splat64(SK[1]);
   R ^= splat64(SK[0]);

   interleave_and_store(out, L, R);
}

}  // namespace

}  // namespace Camellia_HWAES

/*
* Encrypt `blocks` 16-byte blocks from `in` to `out` using the two-block
* 18-round kernel.
*
* Blocks are processed in pairs. A single leftover block is copied into a
* zero-filled two-block buffer, run through the same kernel, and only the
* first 16 output bytes are copied to `out`.
*/
// static
void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_encrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   while(blocks >= 2) {
      Camellia_HWAES::camellia_encrypt_x2_18r(in, out, SK);
      in += 2 * 16;
      out += 2 * 16;
      blocks -= 2;
   }

   if(blocks > 0) {
      // Odd trailing block: pad to two blocks so the x2 kernel can be reused
      uint8_t ibuf[2 * 16] = {0};
      uint8_t obuf[2 * 16] = {0};
      copy_mem(ibuf, in, 16);
      Camellia_HWAES::camellia_encrypt_x2_18r(ibuf, obuf, SK);
      copy_mem(out, obuf, 16);
   }
}

/*
* Decrypt `blocks` 16-byte blocks from `in` to `out` using the two-block
* 18-round kernel.
*
* Blocks are processed in pairs. A single leftover block is copied into a
* zero-filled two-block buffer, run through the same kernel, and only the
* first 16 output bytes are copied to `out`.
*/
// static
void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_decrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   while(blocks >= 2) {
      Camellia_HWAES::camellia_decrypt_x2_18r(in, out, SK);
      in += 2 * 16;
      out += 2 * 16;
      blocks -= 2;
   }

   if(blocks > 0) {
      // Odd trailing block: pad to two blocks so the x2 kernel can be reused
      uint8_t ibuf[2 * 16] = {0};
      uint8_t obuf[2 * 16] = {0};
      copy_mem(ibuf, in, 16);
      Camellia_HWAES::camellia_decrypt_x2_18r(ibuf, obuf, SK);
      copy_mem(out, obuf, 16);
   }
}

/*
* Encrypt `blocks` 16-byte blocks from `in` to `out` using the two-block
* 24-round kernel.
*
* Blocks are processed in pairs. A single leftover block is copied into a
* zero-filled two-block buffer, run through the same kernel, and only the
* first 16 output bytes are copied to `out`.
*/
// static
void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_encrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   while(blocks >= 2) {
      Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK);
      in += 2 * 16;
      out += 2 * 16;
      blocks -= 2;
   }

   if(blocks > 0) {
      // Odd trailing block: pad to two blocks so the x2 kernel can be reused
      uint8_t ibuf[2 * 16] = {0};
      uint8_t obuf[2 * 16] = {0};
      copy_mem(ibuf, in, 16);
      Camellia_HWAES::camellia_encrypt_x2_24r(ibuf, obuf, SK);
      copy_mem(out, obuf, 16);
   }
}

/*
* Decrypt `blocks` 16-byte blocks from `in` to `out` using the two-block
* 24-round kernel.
*
* Blocks are processed in pairs. A single leftover block is copied into a
* zero-filled two-block buffer, run through the same kernel, and only the
* first 16 output bytes are copied to `out`.
*/
// static
void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_decrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   while(blocks >= 2) {
      Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK);
      in += 2 * 16;
      out += 2 * 16;
      blocks -= 2;
   }

   if(blocks > 0) {
      // Odd trailing block: pad to two blocks so the x2 kernel can be reused
      uint8_t ibuf[2 * 16] = {0};
      uint8_t obuf[2 * 16] = {0};
      copy_mem(ibuf, in, 16);
      Camellia_HWAES::camellia_decrypt_x2_24r(ibuf, obuf, SK);
      copy_mem(out, obuf, 16);
   }
}

/*
* Encrypt `blocks` 16-byte blocks from `in` to `out` using the two-block
* 24-round kernel.
*
* Blocks are processed in pairs. A single leftover block is copied into a
* zero-filled two-block buffer, run through the same kernel, and only the
* first 16 output bytes are copied to `out`.
*/
// static
void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_encrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   while(blocks >= 2) {
      Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK);
      in += 2 * 16;
      out += 2 * 16;
      blocks -= 2;
   }

   if(blocks > 0) {
      // Odd trailing block: pad to two blocks so the x2 kernel can be reused
      uint8_t ibuf[2 * 16] = {0};
      uint8_t obuf[2 * 16] = {0};
      copy_mem(ibuf, in, 16);
      Camellia_HWAES::camellia_encrypt_x2_24r(ibuf, obuf, SK);
      copy_mem(out, obuf, 16);
   }
}

/*
* Decrypt `blocks` 16-byte blocks from `in` to `out` using the two-block
* 24-round kernel.
*
* Blocks are processed in pairs. A single leftover block is copied into a
* zero-filled two-block buffer, run through the same kernel, and only the
* first 16 output bytes are copied to `out`.
*/
// static
void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_decrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   while(blocks >= 2) {
      Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK);
      in += 2 * 16;
      out += 2 * 16;
      blocks -= 2;
   }

   if(blocks > 0) {
      // Odd trailing block: pad to two blocks so the x2 kernel can be reused
      uint8_t ibuf[2 * 16] = {0};
      uint8_t obuf[2 * 16] = {0};
      copy_mem(ibuf, in, 16);
      Camellia_HWAES::camellia_decrypt_x2_24r(ibuf, obuf, SK);
      copy_mem(out, obuf, 16);
   }
}

}  // namespace Botan