Source code

Revision control

Copy as Markdown

Other Tools

// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/contrib/dot/dot-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T1, typename T2>
HWY_NOINLINE T1 SimpleDot(const T1* pa, const T2* pb, size_t num) {
float sum = 0.0f;
for (size_t i = 0; i < num; ++i) {
sum += ConvertScalarTo<float>(pa[i]) * ConvertScalarTo<float>(pb[i]);
}
return ConvertScalarTo<T1>(sum);
}
HWY_NOINLINE float SimpleDot(const float* pa, const hwy::bfloat16_t* pb,
size_t num) {
float sum = 0.0f;
for (size_t i = 0; i < num; ++i) {
sum += pa[i] * F32FromBF16(pb[i]);
}
return sum;
}
// Overload is required because the generic template hits an internal compiler
// error on aarch64 clang.
HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
size_t num) {
float sum = 0.0f;
for (size_t i = 0; i < num; ++i) {
sum += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
}
return sum;
}
class TestDot {
// Computes/verifies one dot product.
template <int kAssumptions, class D>
void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
const size_t N = Lanes(d);
const auto random_t = [&rng]() {
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
return static_cast<float>(bits - 512) * (1.0f / 64);
};
const size_t padded =
(kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
HWY_ASSERT(pa && pb);
T* a = pa.get() + misalign_a;
T* b = pb.get() + misalign_b;
size_t i = 0;
for (; i < num; ++i) {
a[i] = ConvertScalarTo<T>(random_t());
b[i] = ConvertScalarTo<T>(random_t());
}
// Fill padding with NaN - the values are not used, but avoids MSAN errors.
for (; i < padded; ++i) {
ScalableTag<float> df1;
a[i] = ConvertScalarTo<T>(GetLane(NaN(df1)));
b[i] = ConvertScalarTo<T>(GetLane(NaN(df1)));
}
const double expected = SimpleDot(a, b, num);
const double magnitude = expected > 0.0 ? expected : -expected;
const double actual =
ConvertScalarTo<double>(Dot::Compute<kAssumptions>(d, a, b, num));
const double max = static_cast<double>(8 * 8 * num);
HWY_ASSERT(-max <= actual && actual <= max);
const double tolerance =
64.0 * ConvertScalarTo<double>(Epsilon<T>()) * HWY_MAX(magnitude, 1.0);
HWY_ASSERT(expected - tolerance <= actual &&
actual <= expected + tolerance);
}
// Runs tests with various alignments.
template <int kAssumptions, class D>
void ForeachMisalign(D d, size_t num, RandomState& rng) {
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
for (size_t ma : misalignments) {
for (size_t mb : misalignments) {
Test<kAssumptions>(d, num, ma, mb, rng);
}
}
}
// Runs tests with various lengths compatible with the given assumptions.
template <int kAssumptions, class D>
void ForeachCount(D d, RandomState& rng) {
const size_t N = Lanes(d);
const size_t counts[] = {1,
3,
7,
16,
HWY_MAX(N / 2, 1),
HWY_MAX(2 * N / 3, 1),
N,
N + 1,
4 * N / 3,
3 * N,
8 * N,
8 * N + 2};
for (size_t num : counts) {
if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
ForeachMisalign<kAssumptions>(d, num, rng);
}
}
public:
// Must be inlined on aarch64 for bf16, else clang crashes.
template <class T, class D>
HWY_INLINE void operator()(T /*unused*/, D d) {
RandomState rng;
// All 8 combinations of the three length-related flags:
ForeachCount<0>(d, rng);
ForeachCount<Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
Dot::kAtLeastOneVector>(d, rng);
}
};
class TestDotF32BF16 {
// Computes/verifies one dot product.
template <int kAssumptions, class D>
void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
using T2 = hwy::bfloat16_t;
const size_t N = Lanes(d);
const auto random_t = [&rng]() {
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
return static_cast<float>(bits - 512) * (1.0f / 64);
};
const size_t padded =
(kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
AlignedFreeUniquePtr<T2[]> pb = AllocateAligned<T2>(misalign_b + padded);
HWY_ASSERT(pa && pb);
T* a = pa.get() + misalign_a;
T2* b = pb.get() + misalign_b;
size_t i = 0;
for (; i < num; ++i) {
a[i] = ConvertScalarTo<T>(random_t());
b[i] = ConvertScalarTo<T2>(random_t());
}
// Fill padding with NaN - the values are not used, but avoids MSAN errors.
for (; i < padded; ++i) {
ScalableTag<float> df1;
a[i] = ConvertScalarTo<T>(GetLane(NaN(df1)));
b[i] = ConvertScalarTo<T2>(GetLane(NaN(df1)));
}
const double expected = SimpleDot(a, b, num);
const double magnitude = expected > 0.0 ? expected : -expected;
const double actual =
ConvertScalarTo<double>(Dot::Compute<kAssumptions>(d, a, b, num));
const double max = static_cast<double>(8 * 8 * num);
HWY_ASSERT(-max <= actual && actual <= max);
const double tolerance =
64.0 * ConvertScalarTo<double>(Epsilon<T2>()) * HWY_MAX(magnitude, 1.0);
HWY_ASSERT(expected - tolerance <= actual &&
actual <= expected + tolerance);
}
// Runs tests with various alignments.
template <int kAssumptions, class D>
void ForeachMisalign(D d, size_t num, RandomState& rng) {
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
for (size_t ma : misalignments) {
for (size_t mb : misalignments) {
Test<kAssumptions>(d, num, ma, mb, rng);
}
}
}
// Runs tests with various lengths compatible with the given assumptions.
template <int kAssumptions, class D>
void ForeachCount(D d, RandomState& rng) {
const size_t N = Lanes(d);
const size_t counts[] = {1,
3,
7,
16,
HWY_MAX(N / 2, 1),
HWY_MAX(2 * N / 3, 1),
N,
N + 1,
4 * N / 3,
3 * N,
8 * N,
8 * N + 2};
for (size_t num : counts) {
if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
ForeachMisalign<kAssumptions>(d, num, rng);
}
}
public:
// Must be inlined on aarch64 for bf16, else clang crashes.
template <class T, class D>
HWY_INLINE void operator()(T /*unused*/, D d) {
RandomState rng;
// All 8 combinations of the three length-related flags:
ForeachCount<0>(d, rng);
ForeachCount<Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
Dot::kAtLeastOneVector>(d, rng);
}
};
// All floating-point types, both arguments same.
void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
// Mixed f32 and bf16.
void TestAllDotF32BF16() {
ForPartialVectors<TestDotF32BF16> test;
test(float());
}
// Both bf16.
void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(DotTest);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotF32BF16);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
} // namespace hwy
#endif