// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <cmath> // std::isfinite
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// Skipped under MSAN because it causes build timeouts.
#if !HWY_IS_MSAN
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename ToT>
struct TestDemoteTo {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
HWY_ASSERT(from && expected);
// Narrower range in the wider type, for clamping before we cast
const T min = ConvertScalarTo<T>(IsSigned<T>() ? LimitsMin<ToT>()
: static_cast<ToT>(0));
const T max = LimitsMax<ToT>();
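    // e.g. demoting int16_t to uint8_t clamps to [0, 255]; int16_t to int8_t
    // clamps to [-128, 127].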
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = rng();
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
const auto in = Load(from_d, from.get());
HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
}
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = rng();
CopyBytes<sizeof(ToT)>(&bits, &expected[i]); // not same size
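        // For unsigned inputs demoted to a signed type, clear the bits above
        // the signed maximum so the value survives the round trip through
        // from[i] (e.g. uint16_t -> int8_t keeps only [0, 127]).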
if (!IsSigned<T>() && IsSigned<ToT>()) {
expected[i] &= static_cast<ToT>(max);
}
from[i] = ConvertScalarTo<T>(expected[i]);
}
const auto in = Load(from_d, from.get());
HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
}
}
};
HWY_NOINLINE void TestAllDemoteToInt() {
const ForDemoteVectors<TestDemoteTo<uint8_t>> from_i16_to_u8;
from_i16_to_u8(int16_t());
from_i16_to_u8(uint16_t());
const ForDemoteVectors<TestDemoteTo<int8_t>> from_i16_to_i8;
from_i16_to_i8(int16_t());
from_i16_to_i8(uint16_t());
const ForDemoteVectors<TestDemoteTo<uint8_t>, 2> from_i32_to_u8;
from_i32_to_u8(int32_t());
from_i32_to_u8(uint32_t());
const ForDemoteVectors<TestDemoteTo<int8_t>, 2> from_i32_to_i8;
from_i32_to_i8(int32_t());
from_i32_to_i8(uint32_t());
#if HWY_HAVE_INTEGER64
const ForDemoteVectors<TestDemoteTo<uint8_t>, 3> from_i64_to_u8;
from_i64_to_u8(int64_t());
from_i64_to_u8(uint64_t());
const ForDemoteVectors<TestDemoteTo<int8_t>, 3> from_i64_to_i8;
from_i64_to_i8(int64_t());
from_i64_to_i8(uint64_t());
#endif
const ForDemoteVectors<TestDemoteTo<uint16_t>> from_i32_to_u16;
from_i32_to_u16(int32_t());
from_i32_to_u16(uint32_t());
const ForDemoteVectors<TestDemoteTo<int16_t>> from_i32_to_i16;
from_i32_to_i16(int32_t());
from_i32_to_i16(uint32_t());
#if HWY_HAVE_INTEGER64
const ForDemoteVectors<TestDemoteTo<uint16_t>, 2> from_i64_to_u16;
from_i64_to_u16(int64_t());
from_i64_to_u16(uint64_t());
const ForDemoteVectors<TestDemoteTo<int16_t>, 2> from_i64_to_i16;
from_i64_to_i16(int64_t());
from_i64_to_i16(uint64_t());
const ForDemoteVectors<TestDemoteTo<uint32_t>> from_i64_to_u32;
from_i64_to_u32(int64_t());
from_i64_to_u32(uint64_t());
const ForDemoteVectors<TestDemoteTo<int32_t>> from_i64_to_i32;
from_i64_to_i32(int64_t());
from_i64_to_i32(uint64_t());
#endif
}
HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_HAVE_FLOAT64
const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
to_i32(double());
const ForDemoteVectors<TestDemoteTo<uint32_t>> to_u32;
to_u32(double());
#endif
}
template <typename ToT>
struct TestDemoteToFloat {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
// For floats, we clamp differently and cannot call LimitsMin.
static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
HWY_ASSERT(from && expected);
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
from[i] = RandomFiniteValue<T>(&rng);
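        // Clamp |from[i]| to the destination's largest finite value before the
        // cast; e.g. when T is double and ToT is float, 1e40 demotes to
        // +FLT_MAX and -1e40 to -FLT_MAX.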
const T magn = std::abs(from[i]);
const T max_abs = HighestValue<ToT>();
        // NOTE: the std:: version from C++11 <cmath> is not defined in RVV
        // GCC, hence the unqualified copysign.
const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
expected[i] = static_cast<ToT>(clipped);
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
DemoteTo(to_d, Load(from_d, from.get())));
}
}
};
HWY_NOINLINE void TestAllDemoteToFloat() {
// Must test f16 separately because we can only load/store/convert them.
#if HWY_HAVE_FLOAT64
const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
to_float(double());
#endif
}
struct TestDemoteUI64ToFloat {
  // This helper function avoids an internal compiler error on GCC 8 with AVX3.
template <class D>
static HWY_NOINLINE void Verify(D from_d, TFromD<D> from, float expected) {
const Rebind<float, D> df32;
HWY_ASSERT_VEC_EQ(df32, Set(df32, expected),
DemoteTo(df32, Set(from_d, from)));
}
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
const Rebind<float, D> df32;
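    // The literal values below have at most 24 significant bits, so their
    // conversions to float are exact; the LimitsMin/LimitsMax cases compare
    // against the (possibly rounded) result of the scalar cast.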
Verify(from_d, static_cast<T>(0), 0.0f);
Verify(from_d, LimitsMax<T>(), static_cast<float>(LimitsMax<T>()));
Verify(from_d, static_cast<T>(11808), 11808.0f);
Verify(from_d, static_cast<T>(261162016), 261162016.0f);
Verify(from_d, static_cast<T>(18665497952256LL), 18665497952256.0f);
if (IsSigned<T>()) {
Verify(from_d, static_cast<T>(-1), -1.0f);
Verify(from_d, LimitsMin<T>(), static_cast<float>(LimitsMin<T>()));
Verify(from_d, static_cast<T>(-17633), -17633.0f);
Verify(from_d, static_cast<T>(-3888877568LL), -3888877568.0f);
Verify(from_d, static_cast<T>(-17851503083520LL), -17851503083520.0f);
}
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<float>(N);
HWY_ASSERT(from && expected);
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; i++) {
const uint64_t bits = rng();
CopySameSize(&bits, &from[i]);
expected[i] = static_cast<float>(from[i]);
}
HWY_ASSERT_VEC_EQ(df32, expected.get(),
DemoteTo(df32, Load(from_d, from.get())));
}
}
};
HWY_NOINLINE void TestAllDemoteUI64ToFloat() {
#if HWY_HAVE_INTEGER64
const ForDemoteVectors<TestDemoteUI64ToFloat, 1> to_float;
to_float(int64_t());
to_float(uint64_t());
#endif
}
struct TestDemoteToBF16 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    // Expected bf16 values are obtained by truncating the upper 16 bits of the
    // binary32 representation; no clamping is required.
static_assert(IsSame<T, float>(),
"TestDemoteToBF16 can only be called if T is float");
const Rebind<bfloat16_t, D> to_d;
const Rebind<uint32_t, D> du32;
const Rebind<uint16_t, D> du16;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<bfloat16_t>(N);
HWY_ASSERT(from && expected);
const auto u16_zero_vect = Zero(du16);
const auto u16_one_vect = Set(du16, 1);
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
from[i] = RandomFiniteValue<T>(&rng);
uint32_t fromBits;
CopyBytes<sizeof(uint32_t)>(&from[i], &fromBits);
uint16_t bf16Bits = static_cast<uint16_t>(fromBits >> 16);
CopyBytes<sizeof(uint16_t)>(&bf16Bits, &expected[i]);
}
const auto in = Load(from_d, from.get());
const auto actual = DemoteTo(to_d, in);
// Adjust expected to account for any possible rounding that was
// carried out by the DemoteTo operation
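      // e.g. for an input of 1.005859375f (0x3F80C000), truncation yields
      // 0x3F80 while round-to-nearest yields 0x3F81; both results are
      // accepted because the discarded low 16 bits are nonzero.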
auto expected_vect = BitCast(du16, Load(to_d, expected.get()));
const auto low_f32_bits = TruncateTo(du16, BitCast(du32, in));
// max_diff_from_expected is equal to (low_f32_bits == 0 ? 0 : 1)
const auto max_diff_from_expected =
Add(VecFromMask(du16, Eq(low_f32_bits, u16_zero_vect)), u16_one_vect);
// expected_adj is equal to (actual_bits - expected_bits == 1 &&
// max_diff_from_expected != 0) ? 1 : 0, where actual_bits is the bits of
// actual and expected_bits is the bits of expected.
auto expected_adj =
And(max_diff_from_expected,
VecFromMask(du16, Eq(Sub(BitCast(du16, actual), expected_vect),
u16_one_vect)));
// Increment expected_vect by expected_adj
expected_vect = Add(expected_vect, expected_adj);
// Store the adjusted expected_vect back into expected
Store(BitCast(to_d, expected_vect), to_d, expected.get());
HWY_ASSERT_VEC_EQ(to_d, expected.get(), actual);
}
}
};
HWY_NOINLINE void TestAllDemoteToBF16() {
const ForDemoteVectors<TestDemoteToBF16, 1> to_bf16;
to_bf16(float());
}
template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
const float test_cases[] = {
// Same as BF16TestCases:
// +/- 1
1.0f,
-1.0f,
// +/- 0
0.0f,
-0.0f,
// near 0
0.25f,
-0.25f,
// +/- integer
4.0f,
-32.0f,
// positive +/- delta
2.015625f,
3.984375f,
// negative +/- delta
-2.015625f,
-3.984375f,
// No huge values - would interfere with sum. But add more to fill 2 * N:
-2.0f,
-10.0f,
0.03125f,
1.03125f,
1.5f,
2.0f,
4.0f,
5.0f,
6.0f,
8.0f,
10.0f,
256.0f,
448.0f,
2080.0f,
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, 2 * N); // allow loading pairs of vectors
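  // e.g. with N = 8 lanes, the 26 cases above are padded to 32; the tail is
  // zero-filled below.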
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
HWY_ASSERT(in && expected);
CopyBytes(test_cases, in.get(), kNumTestCases * sizeof(float));
ZeroBytes(in.get() + kNumTestCases, (padded - kNumTestCases) * sizeof(float));
return in;
}
class TestReorderDemote2To {
// In-place N^2 selection sort to avoid dependencies
void Sort(float* p, size_t count) {
for (size_t i = 0; i < count - 1; ++i) {
// Find min_element
size_t idx_min = i;
for (size_t j = i + 1; j < count; j++) {
if (p[j] < p[idx_min]) {
idx_min = j;
}
}
// Swap with current
const float tmp = p[i];
p[i] = p[idx_min];
p[idx_min] = tmp;
}
}
public:
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
size_t padded;
auto in = ReorderBF16TestCases(d32, padded);
using TBF16 = bfloat16_t;
const Repartition<TBF16, DF32> dbf16;
const Half<decltype(dbf16)> dbf16_half;
const size_t N = Lanes(d32);
auto temp16 = AllocateAligned<TBF16>(2 * N);
auto expected = AllocateAligned<float>(2 * N);
auto actual = AllocateAligned<float>(2 * N);
HWY_ASSERT(temp16 && expected && actual);
for (size_t i = 0; i < padded; i += 2 * N) {
const auto f0 = Load(d32, &in[i + 0]);
const auto f1 = Load(d32, &in[i + N]);
const auto v16 = ReorderDemote2To(dbf16, f0, f1);
Store(v16, dbf16, temp16.get());
const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
// Smoke test: sum should be same (with tolerance for non-associativity)
const auto sum_expected = ReduceSum(d32, Add(f0, f1));
const auto sum_actual = ReduceSum(d32, Add(promoted0, promoted1));
HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
sum_actual <= sum_expected + 1E-4);
// Ensure values are the same after sorting to undo the Reorder
Store(f0, d32, expected.get() + 0);
Store(f1, d32, expected.get() + N);
Store(promoted0, d32, actual.get() + 0);
Store(promoted1, d32, actual.get() + N);
Sort(expected.get(), 2 * N);
Sort(actual.get(), 2 * N);
HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
}
#else // HWY_SCALAR
(void)d32;
#endif
}
};
class TestIntegerReorderDemote2To {
#if HWY_TARGET != HWY_SCALAR
private:
// In-place N^2 selection sort to avoid dependencies
template <class T>
static void Sort(T* p, size_t count) {
for (size_t i = 0; i < count - 1; ++i) {
// Find min_element
size_t idx_min = i;
for (size_t j = i + 1; j < count; j++) {
if (p[j] < p[idx_min]) {
idx_min = j;
}
}
// Swap with current
const T tmp = p[i];
p[i] = p[idx_min];
p[idx_min] = tmp;
}
}
template <class T, class D, class DN>
static void DoIntegerReorderDemote2ToTest(DN dn, T /* t */, D d) {
using TN = TFromD<DN>;
const size_t N = Lanes(d);
const size_t twiceN = N * 2;
auto from = AllocateAligned<T>(twiceN);
auto expected = AllocateAligned<TN>(twiceN);
auto actual = AllocateAligned<TN>(twiceN);
HWY_ASSERT(from && expected && actual);
// Narrower range in the wider type, for clamping before we cast
const T min = ConvertScalarTo<T>(IsSigned<T>() ? LimitsMin<TN>() : TN{0});
const T max = LimitsMax<TN>();
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < twiceN; ++i) {
const uint64_t bits = rng();
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
expected[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
const auto in_1 = Load(d, from.get());
const auto in_2 = Load(d, from.get() + N);
const auto demoted_vect = ReorderDemote2To(dn, in_1, in_2);
Store(demoted_vect, dn, actual.get());
Sort(actual.get(), twiceN);
Sort(expected.get(), twiceN);
HWY_ASSERT_VEC_EQ(dn, expected.get(), Load(dn, actual.get()));
}
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < twiceN; ++i) {
const uint64_t bits = rng();
CopyBytes<sizeof(TN)>(&bits, &expected[i]); // not same size
if (!IsSigned<T>() && IsSigned<TN>()) {
expected[i] &= static_cast<TN>(max);
}
from[i] = ConvertScalarTo<T>(expected[i]);
}
const auto in_1 = Load(d, from.get());
const auto in_2 = Load(d, from.get() + N);
const auto demoted_vect = ReorderDemote2To(dn, in_1, in_2);
Store(demoted_vect, dn, actual.get());
Sort(actual.get(), twiceN);
Sort(expected.get(), twiceN);
HWY_ASSERT_VEC_EQ(dn, expected.get(), Load(dn, actual.get()));
}
}
#endif
public:
template <typename T, class D>
HWY_NOINLINE void operator()(T /*t*/, D d) {
#if HWY_TARGET != HWY_SCALAR
const RepartitionToNarrow<D> dn;
const RebindToSigned<decltype(dn)> dn_i;
const RebindToUnsigned<decltype(dn)> dn_u;
DoIntegerReorderDemote2ToTest(dn_i, T(), d);
DoIntegerReorderDemote2ToTest(dn_u, T(), d);
#else
(void)d;
#endif
}
};
HWY_NOINLINE void TestAllReorderDemote2To() {
ForUI163264(ForShrinkableVectors<TestIntegerReorderDemote2To>());
ForShrinkableVectors<TestReorderDemote2To>()(float());
}
struct TestFloatOrderedDemote2To {
template <typename TN, class DN>
HWY_NOINLINE void operator()(TN /*t*/, DN dn) {
#if HWY_TARGET != HWY_SCALAR
const RepartitionToWide<decltype(dn)> df;
using TF = TFromD<decltype(df)>;
const RebindToUnsigned<decltype(dn)> du16;
const RebindToUnsigned<decltype(df)> du32;
const Half<decltype(du16)> du16_half;
const size_t N = Lanes(df);
const size_t twiceN = N * 2;
auto from = AllocateAligned<TF>(twiceN);
auto expected = AllocateAligned<TN>(twiceN);
HWY_ASSERT(from && expected);
const auto u16_zero_vect = Zero(du16);
const auto u16_one_vect = Set(du16, 1);
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < twiceN; ++i) {
from[i] = RandomFiniteValue<TF>(&rng);
uint32_t u32Bits;
CopyBytes<sizeof(uint32_t)>(&from[i], &u32Bits);
const uint16_t expected_bf16_bits =
static_cast<uint16_t>(u32Bits >> 16);
CopyBytes<sizeof(TN)>(&expected_bf16_bits, &expected[i]);
}
const auto in_1 = Load(df, from.get());
const auto in_2 = Load(df, from.get() + N);
const auto actual = OrderedDemote2To(dn, in_1, in_2);
// Adjust expected to account for any possible rounding that was
// carried out by the OrderedDemote2To operation
auto expected_vect = BitCast(du16, Load(dn, expected.get()));
const auto low_f32_bits =
Combine(du16, TruncateTo(du16_half, BitCast(du32, in_2)),
TruncateTo(du16_half, BitCast(du32, in_1)));
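      // Lane i of low_f32_bits holds the lower 16 bits of the float that
      // produced lane i of `actual`: in_1's lanes fill the lower half and
      // in_2's the upper half.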
// max_diff_from_expected is equal to (low_f32_bits == 0 ? 0 : 1)
const auto max_diff_from_expected =
Add(VecFromMask(du16, Eq(low_f32_bits, u16_zero_vect)), u16_one_vect);
// expected_adj is equal to (actual_bits - expected_bits == 1 &&
// max_diff_from_expected != 0) ? 1 : 0, where actual_bits is the bits of
// actual and expected_bits is the bits of expected.
auto expected_adj =
And(max_diff_from_expected,
VecFromMask(du16, Eq(Sub(BitCast(du16, actual), expected_vect),
u16_one_vect)));
// Increment expected_vect by expected_adj
expected_vect = Add(expected_vect, expected_adj);
// Store the adjusted expected_vect back into expected
Store(BitCast(dn, expected_vect), dn, expected.get());
HWY_ASSERT_VEC_EQ(dn, expected.get(), actual);
}
#else
(void)dn;
#endif
}
};
class TestIntegerOrderedDemote2To {
#if HWY_TARGET != HWY_SCALAR
private:
template <class T, class D, class DN>
static void DoIntegerOrderedDemote2ToTest(DN dn, T /*t*/, D d) {
using TN = TFromD<DN>;
const size_t N = Lanes(d);
const size_t twiceN = N * 2;
auto from = AllocateAligned<T>(twiceN);
auto expected = AllocateAligned<TN>(twiceN);
HWY_ASSERT(from && expected);
// Narrower range in the wider type, for clamping before we cast
const T min = ConvertScalarTo<T>(IsSigned<T>() ? LimitsMin<TN>() : TN{0});
const T max = LimitsMax<TN>();
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < twiceN; ++i) {
const uint64_t bits = rng();
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
expected[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
const auto in_1 = Load(d, from.get());
const auto in_2 = Load(d, from.get() + N);
const auto actual = OrderedDemote2To(dn, in_1, in_2);
HWY_ASSERT_VEC_EQ(dn, expected.get(), actual);
}
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < twiceN; ++i) {
const uint64_t bits = rng();
CopyBytes<sizeof(TN)>(&bits, &expected[i]); // not same size
if (!IsSigned<T>() && IsSigned<TN>()) {
expected[i] &= static_cast<TN>(max);
}
from[i] = ConvertScalarTo<T>(expected[i]);
}
const auto in_1 = Load(d, from.get());
const auto in_2 = Load(d, from.get() + N);
const auto actual = OrderedDemote2To(dn, in_1, in_2);
HWY_ASSERT_VEC_EQ(dn, expected.get(), actual);
}
}
#endif
public:
template <typename T, class D>
HWY_NOINLINE void operator()(T /*t*/, D d) {
#if HWY_TARGET != HWY_SCALAR
const RepartitionToNarrow<D> dn;
const RebindToSigned<decltype(dn)> dn_i;
const RebindToUnsigned<decltype(dn)> dn_u;
DoIntegerOrderedDemote2ToTest(dn_i, T(), d);
DoIntegerOrderedDemote2ToTest(dn_u, T(), d);
#else
(void)d;
#endif
}
};
HWY_NOINLINE void TestAllOrderedDemote2To() {
ForUI163264(ForShrinkableVectors<TestIntegerOrderedDemote2To>());
ForShrinkableVectors<TestFloatOrderedDemote2To>()(bfloat16_t());
// TODO(janwas): replace previous line with this once supported
// ForSpecialTypes(ForShrinkableVectors<TestFloatOrderedDemote2To>());
}
struct TestI32F64 {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
using TI = int32_t;
const Rebind<TI, DF> di;
const size_t N = Lanes(df);
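    // DemoteTo from double to int32_t truncates toward zero (2.001 -> 2,
    // 3.9999 -> 3) and saturates out-of-range inputs to LimitsMin/LimitsMax,
    // as the cases below verify.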
// Integer positive
HWY_ASSERT_VEC_EQ(di, Iota(di, 4), DemoteTo(di, Iota(df, 4.0)));
// Integer negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -static_cast<TI>(N)),
DemoteTo(di, Iota(df, -ConvertScalarTo<TF>(N))));
// Above positive
HWY_ASSERT_VEC_EQ(di, Iota(di, 2), DemoteTo(di, Iota(df, 2.001)));
// Below positive
HWY_ASSERT_VEC_EQ(di, Iota(di, 3), DemoteTo(di, Iota(df, 3.9999)));
const TF eps = static_cast<TF>(0.0001);
// Above negative
HWY_ASSERT_VEC_EQ(
di, Iota(di, -static_cast<TI>(N)),
DemoteTo(di, Iota(df, -ConvertScalarTo<TF>(N + 1) + eps)));
// Below negative
HWY_ASSERT_VEC_EQ(
di, Iota(di, -static_cast<TI>(N + 1)),
DemoteTo(di, Iota(df, -ConvertScalarTo<TF>(N + 1) - eps)));
// Huge positive float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
DemoteTo(di, Set(df, TF(1E12))));
// Huge negative float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
DemoteTo(di, Set(df, TF(-1E12))));
}
};
HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
ForDemoteVectors<TestI32F64>()(double());
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // !HWY_IS_MSAN
#if HWY_ONCE
namespace hwy {
#if !HWY_IS_MSAN
HWY_BEFORE_TEST(HwyDemoteTest);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteUI64ToFloat);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToBF16);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllOrderedDemote2To);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
#endif // !HWY_IS_MSAN
} // namespace hwy
#endif  // HWY_ONCE