// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Target-specific helper functions for use by *_test.cc.
#include <stdio.h>
#include <string.h> // memset
// IWYU pragma: begin_exports
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/per_target.h"
#include "hwy/targets.h"
#include "hwy/tests/hwy_gtest.h"
#include "hwy/tests/test_util.h"
// IWYU pragma: end_exports
// After test_util (also includes highway.h)
#include "hwy/print-inl.h"
// Per-target include guard
// clang-format off
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
// clang-format on
#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#else
#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Like Iota, but avoids wrapping around to negative integers.
template <class D, HWY_IF_FLOAT_D(D)>
HWY_INLINE Vec<D> PositiveIota(D d) {
return Iota(d, 1);
}
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
HWY_INLINE Vec<D> PositiveIota(D d) {
const auto vi = Iota(d, 1);
return Max(And(vi, Set(d, LimitsMax<TFromD<D>>())),
Set(d, static_cast<TFromD<D>>(1)));
}
// Same as Iota, but supports bf16. This is possibly too expensive for general
// use, but fine for tests.
template <class D, typename First, HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
VFromD<D> IotaForSpecial(D d, First first) {
return Iota(d, first);
}
#if HWY_HAVE_FLOAT16
template <class D, typename First, HWY_IF_F16_D(D), HWY_IF_LANES_GT_D(D, 1)>
VFromD<D> IotaForSpecial(D d, First first) {
return Iota(d, first);
}
#else // !HWY_HAVE_FLOAT16
template <class D, typename First, HWY_IF_F16_D(D), HWY_IF_LANES_GT_D(D, 1),
HWY_IF_POW2_GT_D(D, -1)>
VFromD<D> IotaForSpecial(D d, First first) {
const Repartition<float, D> df;
const size_t NW = Lanes(d) / 2;
const Half<D> dh;
const float first2 = static_cast<float>(first) + static_cast<float>(NW);
return Combine(d, DemoteTo(dh, Iota(df, first2)),
DemoteTo(dh, Iota(df, first)));
// TODO(janwas): enable when supported for f16
// return OrderedDemote2To(d, Iota(df, first), Iota(df, first + NW));
}
// For partial vectors, a single f32 vector is enough, and the prior overload
// might not be able to Repartition.
template <class D, typename First, HWY_IF_F16_D(D), HWY_IF_LANES_GT_D(D, 1),
HWY_IF_POW2_LE_D(D, -1)>
VFromD<D> IotaForSpecial(D d, First first) {
const Rebind<float, D> df;
return DemoteTo(d, Iota(df, first));
}
#endif // HWY_HAVE_FLOAT16
template <class D, typename First, HWY_IF_BF16_D(D), HWY_IF_LANES_GT_D(D, 1),
HWY_IF_POW2_GT_D(D, -1)>
VFromD<D> IotaForSpecial(D d, First first) {
const Repartition<float, D> df;
const float first1 = ConvertScalarTo<float>(first);
const float first2 = first1 + static_cast<float>(Lanes(d) / 2);
return OrderedDemote2To(d, Iota(df, first1), Iota(df, first2));
}
// For partial vectors, a single f32 vector is enough, and the prior overload
// might not be able to Repartition.
template <class D, typename First, HWY_IF_BF16_D(D), HWY_IF_LANES_GT_D(D, 1),
HWY_IF_POW2_LE_D(D, -1)>
VFromD<D> IotaForSpecial(D d, First first) {
const Rebind<float, D> df;
return DemoteTo(d, Iota(df, first));
}
// OrderedDemote2To does not work for single lanes, so special-case that.
template <class D, typename First, HWY_IF_SPECIAL_FLOAT_D(D),
HWY_IF_LANES_D(D, 1)>
VFromD<D> IotaForSpecial(D d, First first) {
const Rebind<float, D> df;
return DemoteTo(d, Set(df, static_cast<float>(first)));
}
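// Usage note: IotaForSpecial(d, 1) yields {1, 2, 3, ...} even when TFromD<D>
// is bf16 (or f16 without native support), by generating the sequence in f32
// and demoting, whereas plain Iota requires native lane arithmetic.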
// Compare expected array to vector.
// TODO(b/287462770): inline to work around incorrect SVE codegen.
template <class D, typename T = TFromD<D>>
HWY_INLINE void AssertVecEqual(D d, const T* expected, Vec<D> actual,
const char* filename, const int line) {
const size_t N = Lanes(d);
auto actual_lanes = AllocateAligned<T>(N);
Store(actual, d, actual_lanes.get());
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N,
target_name, filename, line);
}
// Compare expected vector to vector.
// TODO(b/287462770): inline to work around incorrect SVE codegen.
template <class D, typename T = TFromD<D>>
HWY_INLINE void AssertVecEqual(D d, Vec<D> expected, Vec<D> actual,
const char* filename, int line) {
const size_t N = Lanes(d);
auto expected_lanes = AllocateAligned<T>(N);
auto actual_lanes = AllocateAligned<T>(N);
Store(expected, d, expected_lanes.get());
Store(actual, d, actual_lanes.get());
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected_lanes.get(), actual_lanes.get(),
N, target_name, filename, line);
}
// Only checks the valid mask elements (those whose index < Lanes(d)).
template <class D>
HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
const char* filename, int line) {
// lvalues prevented MSAN failure in farm_sve.
const Vec<D> va = VecFromMask(d, a);
const Vec<D> vb = VecFromMask(d, b);
AssertVecEqual(d, va, vb, filename, line);
const char* target_name = hwy::TargetName(HWY_TARGET);
AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line);
AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line);
const size_t N = Lanes(d);
#if HWY_TARGET == HWY_SCALAR
const Rebind<uint8_t, D> d8;
#else
const Repartition<uint8_t, D> d8;
#endif
const size_t N8 = Lanes(d8);
auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(size_t{8}, N8));
auto bits_b = AllocateAligned<uint8_t>(HWY_MAX(size_t{8}, N8));
memset(bits_a.get(), 0, N8);
memset(bits_b.get(), 0, N8);
const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
size_t i = 0;
// First check whole bytes (if that many elements are still valid)
for (; i < N / 8; ++i) {
if (bits_a[i] != bits_b[i]) {
fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
bits_a[i], bits_b[i]);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
}
}
// Then the valid bit(s) in the last byte.
const size_t remainder = N % 8;
if (remainder != 0) {
const int mask = (1 << remainder) - 1;
const int valid_a = bits_a[i] & mask;
const int valid_b = bits_b[i] & mask;
if (valid_a != valid_b) {
fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
static_cast<int>(i), valid_a, valid_b);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
}
}
}
// Only sets valid elements (those whose index < Lanes(d)). This helps catch
// tests that are not masking off the (undefined) upper mask elements.
//
// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks.
template <class D>
HWY_INLINE Mask<D> MaskTrue(const D d) {
return FirstN(d, Lanes(d));
}
// MaskFalse is now implemented in x86_128-inl.h on AVX3, arm_sve-inl.h on SVE,
// rvv-inl.h on RVV, and generic_ops-inl.h on all other targets
#ifndef HWY_ASSERT_EQ
#define HWY_ASSERT_EQ(expected, actual) \
hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \
__LINE__)
#define HWY_ASSERT_ARRAY_EQ(expected, actual, count) \
hwy::AssertArrayEqual(expected, actual, count, hwy::TargetName(HWY_TARGET), \
__FILE__, __LINE__)
#define HWY_ASSERT_STRING_EQ(expected, actual) \
hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \
__FILE__, __LINE__)
#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
#endif // HWY_ASSERT_EQ
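// Usage sketch of the assertion macros (TestAddSelf is a hypothetical functor
// name, not part of this header):
//   struct TestAddSelf {
//     template <typename T, class D>
//     HWY_NOINLINE void operator()(T /*unused*/, D d) {
//       const size_t N = Lanes(d);
//       auto expected = AllocateAligned<T>(N);
//       for (size_t i = 0; i < N; ++i) {
//         expected[i] = ConvertScalarTo<T>(2);  // 1 + 1, exact for all types
//       }
//       const Vec<D> v1 = Set(d, ConvertScalarTo<T>(1));
//       // Array overload: compares N lanes against expected[0, N).
//       HWY_ASSERT_VEC_EQ(d, expected.get(), Add(v1, v1));
//       // Vector overload: compares two vectors lane by lane.
//       HWY_ASSERT_VEC_EQ(d, Set(d, ConvertScalarTo<T>(2)), Add(v1, v1));
//     }
//   };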
namespace detail {
// Helpers for instantiating tests with combinations of lane types / counts.
// Calls Test for each CappedTag<T, N> where N is in [kMinArg, kMul * kMinArg]
// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
// is required to ensure capped vectors remain extendable. Implemented by
// recursively halving kMul until it is zero.
template <typename T, size_t kMul, size_t kMinArg, class Test, int kPow2 = 0>
struct ForeachCappedR {
static void Do(size_t min_lanes, size_t max_lanes) {
const CappedTag<T, kMul * kMinArg, kPow2> d;
// If we already don't have enough lanes, stop.
const size_t lanes = Lanes(d);
if (lanes < min_lanes) return;
if (lanes <= max_lanes) {
Test()(T(), d);
}
ForeachCappedR<T, kMul / 2, kMinArg, Test, kPow2>::Do(min_lanes, max_lanes);
}
};
// Base case to stop the recursion.
template <typename T, size_t kMinArg, class Test, int kPow2>
struct ForeachCappedR<T, 0, kMinArg, Test, kPow2> {
static void Do(size_t, size_t) {}
};
#if HWY_HAVE_SCALABLE
template <typename T>
constexpr int MinPow2() {
// Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
// as kPow2 == -3). The fraction also must not result in zero lanes for the
// smallest possible vector size, which is 128 bits even on RISC-V (with the
// application processor profile).
return HWY_MAX(-3, -static_cast<int>(CeilLog2(16 / sizeof(T))));
}
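// Worked examples: for T=uint8_t, CeilLog2(16/1)=4, so MinPow2() is
// HWY_MAX(-3, -4) = -3 (LMUL=1/8). For T=uint64_t, CeilLog2(16/8)=1, so
// MinPow2() is HWY_MAX(-3, -1) = -1, because LMUL=1/2 already yields a
// single u64 lane in a 128-bit vector.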
constexpr int MaxPow2() {
#if HWY_TARGET == HWY_RVV
// Only RVV allows multiple vector registers.
return 3; // LMUL=8
#else
// For all other platforms, we cannot exceed a full vector.
return 0;
#endif
}
// Iterates kPow2 up to and including kMaxPow2. Below we specialize for
// valid=false to stop the iteration. The ForeachPow2Trim enables shorter
// argument lists, but use ForeachPow2 when you want to specify the actual min.
template <typename T, int kPow2, int kMaxPow2, bool valid, class Test>
struct ForeachPow2 {
static void Do(size_t min_lanes) {
const ScalableTag<T, kPow2> d;
static_assert(MinPow2<T>() <= kPow2 && kPow2 <= MaxPow2(), "");
if (Lanes(d) >= min_lanes) {
Test()(T(), d);
} else {
fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
static_cast<int>(sizeof(T)), kPow2);
HWY_ASSERT(min_lanes != 1);
}
ForeachPow2<T, kPow2 + 1, kMaxPow2, (kPow2 + 1) <= kMaxPow2, Test>::Do(
min_lanes);
}
};
// Base case to stop the iteration.
template <typename T, int kPow2, int kMaxPow2, class Test>
struct ForeachPow2<T, kPow2, kMaxPow2, /*valid=*/false, Test> {
static void Do(size_t) {}
};
// Iterates kPow2 over [MinPow2<T>() + kAddMin, MaxPow2() - kSubMax].
// This is a wrapper that shortens argument lists, allowing users to skip the
// MinPow2 and MaxPow2. Nonzero kAddMin implies a minimum LMUL, and nonzero
// kSubMax reduces the maximum LMUL (e.g. for type promotions, where the result
// is larger, thus the input cannot already use the maximum LMUL).
template <typename T, int kAddMin, int kSubMax, class Test>
using ForeachPow2Trim =
ForeachPow2<T, MinPow2<T>() + kAddMin, MaxPow2() - kSubMax,
MinPow2<T>() + kAddMin <= MaxPow2() - kSubMax, Test>;
#else
// ForeachCappedR already handled all possible sizes.
#endif // HWY_HAVE_SCALABLE
} // namespace detail
// These 'adapters' call a test for all possible N or kPow2 subject to
// constraints such as "vectors must be extendable" or "vectors >= 128 bits".
// They may be called directly, or via For*Types. Note that for an adapter C,
// `C<Test>(T())` does not call the test - the correct invocation is
// `C<Test>()(T())`, or preferably `ForAllTypes(C<Test>())`. We check at runtime
// that operator() is called to prevent such bugs. Note that this is not
// thread-safe, but that is fine because these adapters are typically local
// variables.
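// For example, a test file typically wraps its functor in an adapter and
// passes it to one of the For*Types lists below (sketch; TestAllAddSelf and
// TestAddSelf are hypothetical names):
//   HWY_NOINLINE void TestAllAddSelf() {
//     ForAllTypes(ForPartialVectors<TestAddSelf>());
//   }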
// Calls Test for all powers of two in [1, Lanes(d) * (RVV ? 2 : 1)]. This is
// for interleaved_test; RVV segments are limited to 8 registers, so we can
// only go up to LMUL=2.
template <class Test>
class ForMaxPow2 {
mutable bool called_ = false;
public:
~ForMaxPow2() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*unused*/) const {
called_ = true;
#if HWY_TARGET == HWY_SCALAR
detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
#else
detail::ForeachCappedR<T, HWY_LANES(T), 1, Test>::Do(
1, Lanes(ScalableTag<T>()));
#if HWY_TARGET == HWY_RVV
// To get LMUL=2 (kPow2=1), subtract 2 from MaxPow2()=3.
detail::ForeachPow2Trim<T, 0, 2, Test>::Do(1);
#elif HWY_HAVE_SCALABLE
detail::ForeachPow2Trim<T, 0, 0, Test>::Do(1);
#endif
#endif // HWY_TARGET == HWY_SCALAR
}
};
// Calls Test for all powers of two in [1, Lanes(d) >> kPow2]. This is for
// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
template <class Test, int kPow2 = 1>
class ForExtendableVectors {
mutable bool called_ = false;
public:
~ForExtendableVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*unused*/) const {
called_ = true;
constexpr size_t kMaxCapped = HWY_LANES(T);
// Skip CappedTag that are already full vectors.
const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
(void)kMaxCapped;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
// not supported
#else
constexpr size_t kMul = kMaxCapped >> kPow2;
constexpr size_t kMinArg = size_t{1} << kPow2;
detail::ForeachCappedR<T, kMul, kMinArg, Test, -kPow2>::Do(1, max_lanes);
#if HWY_HAVE_SCALABLE
detail::ForeachPow2Trim<T, 0, kPow2, Test>::Do(1);
#endif
#endif // HWY_SCALAR
}
};
// Calls Test for all power-of-two N in [1 << kPow2, Lanes(d)]. This is for ops
// that narrow their input, e.g. UpperHalf.
template <class Test, int kPow2 = 1>
class ForShrinkableVectors {
mutable bool called_ = false;
public:
~ForShrinkableVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*unused*/) const {
called_ = true;
constexpr size_t kMinLanes = size_t{1} << kPow2;
constexpr size_t kMaxCapped = HWY_LANES(T);
// For shrinking, an upper limit is unnecessary.
constexpr size_t max_lanes = kMaxCapped;
(void)kMinLanes;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
// not supported
#elif HWY_HAVE_SCALABLE
detail::ForeachPow2Trim<T, kPow2, 0, Test>::Do(kMinLanes);
#else
detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
kMinLanes, max_lanes);
#endif // HWY_TARGET == HWY_SCALAR
}
};
// Calls Test for all supported power-of-two vectors of at least kMinBits.
// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
template <size_t kMinBits, class Test>
class ForGEVectors {
mutable bool called_ = false;
public:
~ForGEVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*unused*/) const {
called_ = true;
constexpr size_t kMaxCapped = HWY_LANES(T);
constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
// An upper limit is unnecessary.
constexpr size_t max_lanes = kMaxCapped;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
(void)kMinLanes; // not supported
#else
detail::ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(
kMinLanes, max_lanes);
#if HWY_HAVE_SCALABLE
// Can be 0 (handled below) if kMinBits > 128.
constexpr size_t kRatio = 128 / kMinBits;
constexpr int kMinPow2 =
kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
constexpr bool kValid = kMinPow2 <= detail::MaxPow2();
detail::ForeachPow2<T, kMinPow2, detail::MaxPow2(), kValid, Test>::Do(
kMinLanes);
#endif
#endif // HWY_TARGET == HWY_SCALAR
}
};
template <class Test>
using ForGE128Vectors = ForGEVectors<128, Test>;
// Calls Test for all N that can be promoted (not the same as Extendable because
// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
template <class Test, int kPow2 = 1>
class ForPromoteVectors {
mutable bool called_ = false;
public:
~ForPromoteVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*unused*/) const {
called_ = true;
constexpr size_t kFactor = size_t{1} << kPow2;
static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
constexpr size_t kMaxCapped = HWY_LANES(T);
// Skip CappedTag that are already full vectors.
const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
(void)kMaxCapped;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
#else
using DLargestFrom = CappedTag<T, (kMaxCapped >> kPow2) * kFactor, -kPow2>;
static_assert(HWY_MAX_LANES_D(DLargestFrom) <= (kMaxCapped >> kPow2),
"HWY_MAX_LANES_D(DLargestFrom) must be less than or equal to "
"(kMaxCapped >> kPow2)");
detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kFactor, Test, -kPow2>::Do(
1, max_lanes);
#if HWY_HAVE_SCALABLE
detail::ForeachPow2Trim<T, 0, kPow2, Test>::Do(1);
#endif
#endif // HWY_SCALAR
}
};
// Calls Test for all N that can be demoted (not the same as Shrinkable because
// HWY_SCALAR has one lane, and a one-lane vector with a lane size of at least
// 2 bytes can always be demoted to a vector with a smaller lane type).
template <class Test, int kPow2 = 1>
class ForDemoteVectors {
mutable bool called_ = false;
public:
~ForDemoteVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*unused*/) const {
called_ = true;
#if HWY_HAVE_SCALABLE
// kMinTVecPow2 is the smallest Pow2 for a vector with lane type T that is
// supported by detail::ForeachPow2Trim
constexpr int kMinTVecPow2 = detail::MinPow2<T>();
// detail::MinPow2<T>() + kMinPow2Adj is the smallest Pow2 for a vector with
// lane type T that can be demoted to a vector with a lane size of
// (sizeof(T) >> kPow2)
constexpr int kMinPow2Adj = HWY_MAX(-3 - kMinTVecPow2 + kPow2, 0);
detail::ForeachPow2Trim<T, kMinPow2Adj, 0, Test>::Do(1);
// On targets with scalable vectors, detail::ForeachCappedR below only
// needs to be executed for vectors that have less than
// Lanes(ScalableTag<T>()) as full vectors were already checked by the
// detail::ForeachPow2Trim above.
constexpr size_t kMaxCapped = HWY_LANES(T) >> 1;
const size_t max_lanes = Lanes(ScalableTag<T>()) >> 1;
#else
// On targets where HWY_HAVE_SCALABLE is 0, any vector with HWY_LANES(T)
// or fewer lanes can always be demoted to a vector with a smaller lane
// type.
constexpr size_t kMaxCapped = HWY_LANES(T);
const size_t max_lanes = kMaxCapped;
#endif
detail::ForeachCappedR<T, kMaxCapped, 1, Test>::Do(1, max_lanes);
}
};
// For LowerHalf/Quarter.
template <class Test, int kPow2 = 1>
class ForHalfVectors {
mutable bool called_ = false;
public:
~ForHalfVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*unused*/) const {
called_ = true;
#if HWY_TARGET == HWY_SCALAR
detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
#else
constexpr size_t kMinLanes = size_t{1} << kPow2;
// For shrinking, an upper limit is unnecessary.
constexpr size_t kMaxCapped = HWY_LANES(T);
detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
kMinLanes, kMaxCapped);
// TODO(janwas): call Extendable if kMinLanes check not required?
#if HWY_HAVE_SCALABLE
detail::ForeachPow2Trim<T, kPow2, 0, Test>::Do(kMinLanes);
#endif
#endif // HWY_TARGET == HWY_SCALAR
}
};
// Calls Test for all power-of-two N in [1, Lanes(d)]. This is the default
// for ops that do not narrow nor widen their input, nor require 128 bits.
template <class Test>
class ForPartialVectors {
mutable bool called_ = false;
public:
~ForPartialVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T t) const {
called_ = true;
#if HWY_TARGET == HWY_SCALAR
(void)t;
detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
#else
ForExtendableVectors<Test, 0>()(t);
#endif
}
};
// ForPartialFixedOrFullScalableVectors calls Test for each D where
// MaxLanes(D()) == MaxLanes(DFromV<VFromD<D>>())
#if HWY_HAVE_SCALABLE
template <class Test>
class ForPartialFixedOrFullScalableVectors {
mutable bool called_ = false;
public:
~ForPartialFixedOrFullScalableVectors() {
if (!called_) {
HWY_ABORT("Test is incorrect, ensure operator() is called");
}
}
template <typename T>
void operator()(T /*t*/) const {
called_ = true;
#if HWY_TARGET == HWY_RVV
constexpr int kMinPow2 = -3 + static_cast<int>(CeilLog2(sizeof(T)));
constexpr int kMaxPow2 = 3;
#else
constexpr int kMinPow2 = 0;
constexpr int kMaxPow2 = 0;
#endif
detail::ForeachPow2<T, kMinPow2, kMaxPow2, true, Test>::Do(1);
}
};
#elif HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
template <class Test>
using ForPartialFixedOrFullScalableVectors =
ForGEVectors<HWY_MAX_BYTES * 8, Test>;
#else
template <class Test>
using ForPartialFixedOrFullScalableVectors = ForPartialVectors<Test>;
#endif
// Type lists to shorten call sites:
template <class Func>
void ForSignedTypes(const Func& func) {
func(int8_t());
func(int16_t());
func(int32_t());
#if HWY_HAVE_INTEGER64
func(int64_t());
#endif
}
template <class Func>
void ForUnsignedTypes(const Func& func) {
func(uint8_t());
func(uint16_t());
func(uint32_t());
#if HWY_HAVE_INTEGER64
func(uint64_t());
#endif
}
template <class Func>
void ForIntegerTypes(const Func& func) {
ForSignedTypes(func);
ForUnsignedTypes(func);
}
template <class Func>
void ForFloat16Types(const Func& func) {
#if HWY_HAVE_FLOAT16
func(float16_t());
#else
(void)func;
#endif
}
template <class Func>
void ForFloat64Types(const Func& func) {
#if HWY_HAVE_FLOAT64
func(double());
#else
(void)func;
#endif
}
// `#if HWY_HAVE_FLOAT*` is sufficient for tests using static dispatch. In
// sort_test we also use dynamic dispatch, so there we call the For*Dynamic
// functions which also check hwy::HaveFloat*.
template <class Func>
void ForFloat16TypesDynamic(const Func& func) {
#if HWY_HAVE_FLOAT16
if (hwy::HaveFloat16()) {
func(float16_t());
}
#else
(void)func;
#endif
}
template <class Func>
void ForFloat64TypesDynamic(const Func& func) {
#if HWY_HAVE_FLOAT64
if (hwy::HaveFloat64()) {
func(double());
}
#else
(void)func;
#endif
}
template <class Func>
void ForFloat3264Types(const Func& func) {
func(float());
ForFloat64Types(func);
}
template <class Func>
void ForFloatTypes(const Func& func) {
ForFloat16Types(func);
ForFloat3264Types(func);
}
template <class Func>
void ForFloatTypesDynamic(const Func& func) {
ForFloat16TypesDynamic(func);
func(float());
ForFloat64TypesDynamic(func);
}
template <class Func>
void ForAllTypes(const Func& func) {
ForIntegerTypes(func);
ForFloatTypes(func);
}
// For ops that are also unconditionally available for bfloat16_t/float16_t.
template <class Func>
void ForSpecialTypes(const Func& func) {
func(float16_t());
func(bfloat16_t());
}
template <class Func>
void ForAllTypesAndSpecial(const Func& func) {
ForAllTypes(func);
ForSpecialTypes(func);
}
template <class Func>
void ForUI8(const Func& func) {
func(uint8_t());
func(int8_t());
}
template <class Func>
void ForUI16(const Func& func) {
func(uint16_t());
func(int16_t());
}
template <class Func>
void ForUIF16(const Func& func) {
ForUI16(func);
ForFloat16Types(func);
}
template <class Func>
void ForUI32(const Func& func) {
func(uint32_t());
func(int32_t());
}
template <class Func>
void ForUIF32(const Func& func) {
ForUI32(func);
func(float());
}
template <class Func>
void ForUI64(const Func& func) {
#if HWY_HAVE_INTEGER64
func(uint64_t());
func(int64_t());
#endif
}
template <class Func>
void ForUIF64(const Func& func) {
ForUI64(func);
ForFloat64Types(func);
}
template <class Func>
void ForUI3264(const Func& func) {
ForUI32(func);
ForUI64(func);
}
template <class Func>
void ForUIF3264(const Func& func) {
ForUIF32(func);
ForUIF64(func);
}
template <class Func>
void ForU816(const Func& func) {
func(uint8_t());
func(uint16_t());
}
template <class Func>
void ForI816(const Func& func) {
func(int8_t());
func(int16_t());
}
template <class Func>
void ForU163264(const Func& func) {
func(uint16_t());
func(uint32_t());
#if HWY_HAVE_INTEGER64
func(uint64_t());
#endif
}
template <class Func>
void ForUI163264(const Func& func) {
ForUI16(func);
ForUI3264(func);
}
template <class Func>
void ForUIF163264(const Func& func) {
ForUIF16(func);
ForUIF3264(func);
}
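// Example call sites combining a type list with an adapter (sketch; the
// TestFoo functor name is hypothetical):
//   ForUI3264(ForPartialVectors<TestFoo>());              // 32/64-bit integers
//   ForAllTypesAndSpecial(ForPartialVectors<TestFoo>());  // also f16/bf16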
// For tests that involve loops, adjust the trip count so that emulated tests
// finish quickly (but always at least 2 iterations to ensure some diversity).
constexpr size_t AdjustedReps(size_t max_reps) {
#if HWY_ARCH_RVV
return HWY_MAX(max_reps / 32, 2);
#elif HWY_IS_DEBUG_BUILD
return HWY_MAX(max_reps / 8, 2);
#elif HWY_ARCH_ARM
return HWY_MAX(max_reps / 4, 2);
#elif HWY_COMPILER_MSVC
return HWY_MAX(max_reps / 2, 2);
#else
return HWY_MAX(max_reps, 2);
#endif
}
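// For example, a randomized test might write
//   for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { ... }
// so that slow emulated targets (e.g. RVV) run far fewer iterations.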
// Same as above, but the loop trip count will be 1 << max_pow2.
constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
// If "negative" (unsigned wraparound), use original.
#if HWY_ARCH_RVV
return HWY_MIN(max_pow2 - 4, max_pow2);
#elif HWY_IS_DEBUG_BUILD
return HWY_MIN(max_pow2 - 1, max_pow2);
#elif HWY_ARCH_ARM
return HWY_MIN(max_pow2 - 1, max_pow2);
#else
return max_pow2;
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // per-target include guard