// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
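// Per-target include guard: under Highway's foreach_target mechanism this
// header is re-included once per SIMD target, so the guard toggles with
// HWY_TARGET_TOGGLE instead of suppressing repeated inclusion outright.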
#if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef LIB_JPEGLI_DCT_INL_H_
#undef LIB_JPEGLI_DCT_INL_H_
#else
#define LIB_JPEGLI_DCT_INL_H_
#endif
#include "lib/jpegli/transpose-inl.h"
#include "lib/jxl/base/compiler_specific.h"
HWY_BEFORE_NAMESPACE();
namespace jpegli {
namespace HWY_NAMESPACE {
namespace {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Abs;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::DemoteTo;
using hwy::HWY_NAMESPACE::Ge;
using hwy::HWY_NAMESPACE::IfThenElseZero;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Round;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::Vec;
using D = HWY_FULL(float);
using DI = HWY_FULL(int32_t);
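// The 1-D DCT helpers below operate on a block of N rows of 8 floats,
// transforming up to 8 independent columns at once, one per SIMD lane.
// AddReverse computes a_out[i] = a_in1[i] + a_in2[N - 1 - i]: the sum
// ("even") butterfly of the recursive even/odd DCT decomposition.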
template <size_t N>
void AddReverse(const float* JXL_RESTRICT a_in1,
                const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N; i++) {
    auto in1 = Load(d8, a_in1 + i * 8);
    auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
    Store(Add(in1, in2), d8, a_out + i * 8);
  }
}
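// SubReverse computes a_out[i] = a_in1[i] - a_in2[N - 1 - i]: the matching
// difference ("odd") butterfly.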
template <size_t N>
void SubReverse(const float* JXL_RESTRICT a_in1,
                const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N; i++) {
    auto in1 = Load(d8, a_in1 + i * 8);
    auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
    Store(Sub(in1, in2), d8, a_out + i * 8);
  }
}
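// Recombination step for the odd half: coeff[0] = sqrt(2) * coeff[0] +
// coeff[1], then coeff[i] += coeff[i + 1] for 0 < i < N - 1; the last row is
// left unchanged. Each iteration writes a row that later iterations do not
// read, so every output is computed from the original inputs.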
template <size_t N>
void B(float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  constexpr float kSqrt2 = 1.41421356237f;
  auto sqrt2 = Set(d8, kSqrt2);
  auto in1 = Load(d8, coeff);
  auto in2 = Load(d8, coeff + 8);
  Store(MulAdd(in1, sqrt2, in2), d8, coeff);
  for (size_t i = 1; i + 1 < N; i++) {
    auto in1 = Load(d8, coeff + i * 8);
    auto in2 = Load(d8, coeff + (i + 1) * 8);
    Store(Add(in1, in2), d8, coeff + i * 8);
  }
}
// Ideally optimized away by compiler (except the multiply).
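// Interleaves the two halves: the first N/2 rows land at even output
// indices, the second N/2 rows at odd ones.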
template <size_t N>
void InverseEvenOdd(const float* JXL_RESTRICT a_in, float* JXL_RESTRICT a_out) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N / 2; i++) {
    auto in1 = Load(d8, a_in + i * 8);
    Store(in1, d8, a_out + 2 * i * 8);
  }
  for (size_t i = N / 2; i < N; i++) {
    auto in1 = Load(d8, a_in + i * 8);
    Store(in1, d8, a_out + (2 * (i - N / 2) + 1) * 8);
  }
}
// Constants for DCT implementation. Generated by the following snippet:
// for i in range(N // 2):
//   print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
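// For N = 4 the snippet yields 1 / (2 * cos(pi / 8)) ~= 0.5412 and
// 1 / (2 * cos(3 * pi / 8)) ~= 1.3066, matching WcMultipliers<4> below.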
template <size_t N>
struct WcMultipliers;
template <>
struct WcMultipliers<4> {
  static constexpr float kMultipliers[] = {
      0.541196100146197,
      1.3065629648763764,
  };
};
template <>
struct WcMultipliers<8> {
  static constexpr float kMultipliers[] = {
      0.5097955791041592,
      0.6013448869350453,
      0.8999762231364156,
      2.5629154477415055,
  };
};
#if JXL_CXX_LANG < JXL_CXX_17
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];
#endif
// Invoked on full vector.
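// Scales the odd half of the coefficients (rows N/2..N-1) by the
// 1 / (2 * cos((i + 0.5) * pi / N)) factors tabulated above.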
template <size_t N>
void Multiply(float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N / 2; i++) {
    auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
    Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
  }
}
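// Loads Lanes(d8) columns, starting at column `off`, of an 8x8 pixel block
// (with row stride pixels_stride) into the contiguous scratch layout used by
// DCT1DImpl.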
void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
                   size_t off, float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < 8; i++) {
    Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
  }
}
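// Writes the transformed values back at column offset `off`, scaled by 1/8
// to normalize the 8-point DCT.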
void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
                          size_t off) {
  HWY_CAPPED(float, 8) d8;
  auto mul = Set(d8, 1.0f / 8);
  for (size_t i = 0; i < 8; i++) {
    StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
  }
}
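// Recursive 1-D DCT-II over N rows of 8 floats; each SIMD lane carries an
// independent transform. N = 1 and N = 2 are the base cases; larger N splits
// into even/odd halves of size N/2.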
template <size_t N>
struct DCT1DImpl;
template <>
struct DCT1DImpl<1> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
};
template <>
struct DCT1DImpl<2> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
    HWY_CAPPED(float, 8) d8;
    auto in1 = Load(d8, mem);
    auto in2 = Load(d8, mem + 8);
    Store(Add(in1, in2), d8, mem);
    Store(Sub(in1, in2), d8, mem + 8);
  }
};
template <size_t N>
struct DCT1DImpl {
  void operator()(float* JXL_RESTRICT mem) {
    HWY_ALIGN float tmp[N * 8];
    AddReverse<N / 2>(mem, mem + N * 4, tmp);
    DCT1DImpl<N / 2>()(tmp);
    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
    Multiply<N>(tmp);
    DCT1DImpl<N / 2>()(tmp + N * 4);
    B<N / 2>(tmp + N * 4);
    InverseEvenOdd<N>(tmp, mem);
  }
};
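// 1-D DCT along the row index (i.e. of each column) of an 8x8 block,
// handling Lanes(d8) columns per pass.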
void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
           float* JXL_RESTRICT output) {
  HWY_CAPPED(float, 8) d8;
  HWY_ALIGN float tmp[64];
  for (size_t i = 0; i < 8; i += Lanes(d8)) {
    // TODO(veluca): consider removing the temporary memory here (as is done
    // in IDCT), if it turns out that some compilers don't optimize away the
    // loads and this is performance-critical.
    LoadFromBlock(pixels, pixels_stride, i, tmp);
    DCT1DImpl<8>()(tmp);
    StoreToBlockAndScale(tmp, output, i);
  }
}
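// 2-D DCT by row-column decomposition: 1-D DCT, transpose, 1-D DCT of the
// transposed data, transpose back.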
JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels(
    const float* JXL_RESTRICT pixels, size_t pixels_stride,
    float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) {
  DCT1D(pixels, pixels_stride, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
  DCT1D(coefficients, 8, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
}
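// Output-type overloads: int16 coefficients demote the int32 vector, int32
// coefficients store it directly.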
JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                     int16_t* out) {
  Rebind<int16_t, DI> di16;
  Store(DemoteTo(di16, ival), di16, out);
}
JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                     int32_t* out) {
  DI di;
  Store(ival, di, out);
}
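// Quantizes a whole block. qmc appears to hold precomputed multipliers
// (effectively reciprocal quantization steps), so quantization is a single
// multiply. A coefficient is zeroed when its magnitude falls below the
// adaptive threshold zero_bias_offset[k] + aq_strength * zero_bias_mul[k],
// and rounded to the nearest integer otherwise.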
template <typename T>
void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
                   const float* zero_bias_offset, const float* zero_bias_mul,
                   T* block) {
  D d;
  DI di;
  const auto aq_mul = Set(d, aq_strength);
  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
    const auto val = Load(d, dct + k);
    const auto q = Load(d, qmc + k);
    const auto qval = Mul(val, q);
    const auto zb_offset = Load(d, zero_bias_offset + k);
    const auto zb_mul = Load(d, zero_bias_mul + k);
    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
    const auto nzero_mask = Ge(Abs(qval), threshold);
    const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
    StoreQuantizedValue(ival, block + k);
  }
}
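// Per-block pipeline: 2-D DCT, quantization, then a separate DC pass. The DC
// is recomputed from the unquantized dct[0] with the 128 level shift removed,
// and is snapped to the previous block's DC when the difference falls below
// the zero-bias threshold, presumably so the DC delta codes as zero.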
template <typename T>
void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
                             const float* JXL_RESTRICT qmc,
                             int16_t last_dc_coeff, float aq_strength,
                             const float* zero_bias_offset,
                             const float* zero_bias_mul,
                             float* JXL_RESTRICT tmp, T* block) {
  float* JXL_RESTRICT dct = tmp;
  float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
  TransformFromPixels(pixels, stride, dct, scratch_space);
  QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block);
  // Center DC values around zero.
  static constexpr float kDCBias = 128.0f;
  const float dc = (dct[0] - kDCBias) * qmc[0];
  float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0];
  if (std::abs(dc - last_dc_coeff) < dc_threshold) {
    block[0] = last_dc_coeff;
  } else {
    block[0] = std::round(dc);
  }
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace
} // namespace HWY_NAMESPACE
} // namespace jpegli
HWY_AFTER_NAMESPACE();
#endif // LIB_JPEGLI_DCT_INL_H_