// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_PROFILER_H_
#define HIGHWAY_HWY_PROFILER_H_
// High precision, low overhead time measurements. Reports exact call counts
// and total elapsed time for user-defined 'zones' (code regions, i.e. C++
// scopes).
//
// Uses RAII to capture begin/end timestamps, with user-specified zone names:
// { PROFILER_ZONE("name"); /*code*/ } or
// the name of the current function:
// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
//
// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
// print call counts, average durations [CPU cycles] and total durations [s]
// to stdout, sorted in descending order of total duration.
//
// The binary MUST be built with --dynamic_mode=off because we rely on the data
// segments being nearby; if not, an assertion will likely fail.
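//
// Example usage (illustrative sketch; DoWork and its call site are
// hypothetical, only the PROFILER_* macros come from this header):
//
//   void DoWork() {
//     PROFILER_FUNC;  // zone named "DoWork" via __func__
//     {
//       PROFILER_ZONE("Inner");  // nested; deducted from parent's self time
//       // ... code to measure ...
//     }
//   }
//
//   int main() {
//     DoWork();
//     PROFILER_PRINT_RESULTS();  // after all threads have exited all zones
//   }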
#include "hwy/base.h"
// Configuration settings:
// If zero, this file has no effect and no measurements will be recorded.
#ifndef PROFILER_ENABLED
#define PROFILER_ENABLED 0
#endif
// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
// enters at least one zone. Once this buffer is full, the thread will analyze
// and discard packets, thus temporarily adding some observer overhead.
// Each zone occupies 16 bytes: an 8-byte entry packet plus an 8-byte exit
// packet.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 200ULL
#endif
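// Both settings are guarded by #ifndef, so they can be overridden before
// including this header or on the compiler command line, e.g. (hypothetical
// values) -DPROFILER_ENABLED=1 -DPROFILER_THREAD_STORAGE=50ULL, which enables
// measurements with 50 MiB (~6.5 million packets) of storage per thread.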
#if PROFILER_ENABLED || HWY_IDE
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h> // strcmp
#include <algorithm> // std::sort
#include <atomic>
#include "hwy/aligned_allocator.h"
#include "hwy/cache_control.h" // FlushStream
// #include "hwy/contrib/sort/vqsort.h"
#include "hwy/highway.h" // Stream
#include "hwy/robust_statistics.h"
#include "hwy/timer-inl.h"
#include "hwy/timer.h"
#define PROFILER_PRINT_OVERHEAD 0
namespace hwy {
// Upper bounds for fixed-size data structures (guarded via HWY_DASSERT):
// How many threads can actually enter a zone (threads that never enter one
// do not count toward this limit).
// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
// WARNING: a fiber library can spawn hundreds of threads.
static constexpr size_t kMaxThreads = 256;
static constexpr size_t kMaxDepth = 64; // Maximum nesting of zones.
static constexpr size_t kMaxZones = 256; // Total number of zones.
// Overwrites "to" without loading it into the cache (read-for-ownership).
// Both pointers must be aligned.
HWY_ATTR static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
                                     uint64_t* HWY_RESTRICT to) {
  namespace hn = HWY_NAMESPACE;
  const hn::ScalableTag<uint64_t> d;
  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += hn::Lanes(d)) {
    hn::Stream(hn::Load(d, from + i), d, to + i);
  }
}
#pragma pack(push, 1)
// Represents zone entry/exit events. Stores a full-resolution timestamp plus
// an offset (representing zone name or identifying exit packets). POD.
class Packet {
public:
// If offsets do not fit, UpdateOrAdd will overrun our heap allocation
// (governed by kMaxZones). We have seen multi-megabyte offsets.
static constexpr size_t kOffsetBits = 25;
static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);
// We need full-resolution timestamps; at an effective rate of 4 GHz,
// this permits 1 minute zone durations (for longer durations, split into
// multiple zones). Wraparound is handled by masking.
static constexpr size_t kTimestampBits = 64 - kOffsetBits;
static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;
static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));
Packet packet;
packet.bits_ =
(biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
return packet;
}
uint64_t Timestamp() const { return bits_ & kTimestampMask; }
size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }
private:
uint64_t bits_;
};
static_assert(sizeof(Packet) == 8, "Wrong Packet size");
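// Bit layout of Packet::bits_, given kOffsetBits = 25 and kTimestampBits = 39:
//   bits 63..39: biased_offset (zone name relative to StringOrigin, or
//                kOffsetBias for exit packets)
//   bits 38..0 : timestamp (low bits of the cycle counter; wraps around)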
// Returns the address of a string literal. Assuming zone names are also
// literals and stored nearby, we can represent them as offsets, which are
// faster to compute than hashes or even a static index.
//
// This function must not be static - each call (even from other translation
// units) must return the same value.
inline const char* StringOrigin() {
// Chosen such that no zone name is a prefix nor suffix of this string
// to ensure they aren't merged (offset 0 identifies zone-exit packets).
static const char* string_origin = "__#__";
return string_origin - Packet::kOffsetBias;
}
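// Example: for a zone entered via PROFILER_ZONE("Inner"), WriteEntry stores
//   biased_offset = "Inner" - StringOrigin()
//                 = ("Inner" - "__#__") + Packet::kOffsetBias,
// and Print() recovers the name as StringOrigin() + biased_offset. The bias
// keeps offsets positive even for literals placed before "__#__" in memory.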
// Representation of an active zone, stored in a stack. Used to deduct
// child durations from the parent's total time, yielding its self time. POD.
struct Node {
Packet packet;
uint64_t child_total;
};
static_assert(sizeof(Node) == 16, "Wrong Node size");
// Holds statistics for all zones with the same name. POD.
struct Accumulator {
static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;
uint64_t BiasedOffset() const { return u128.lo >> kNumCallBits; }
uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
uint64_t Duration() const { return u128.hi; }
void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
u128.hi = duration;
u128.lo = (biased_offset << kNumCallBits) + num_calls;
}
void Add(uint64_t num_calls, uint64_t duration) {
u128.lo += num_calls;
u128.hi += duration;
}
// For fast sorting by duration, which must therefore be the hi element.
// lo holds BiasedOffset and NumCalls.
uint128_t u128;
};
static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");
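// Bit layout of Accumulator, given kNumCallBits = 39:
//   u128.hi        : total duration [cycles], so comparing u128 compares cost
//   u128.lo 63..39 : biased_offset
//   u128.lo 38..0  : num_calls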
template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
if (subtrahend > minuend) {
return 0;
}
return minuend - subtrahend;
}
// Per-thread call graph (stack) and Accumulator for each zone.
class Results {
public:
Results() { ZeroBytes(zones_, sizeof(zones_)); }
// Used for computing overhead when this thread encounters its first Zone.
// This has no observable effect apart from increasing "analyze_elapsed_".
uint64_t ZoneDuration(const Packet* packets) {
HWY_DASSERT(depth_ == 0);
HWY_DASSERT(num_zones_ == 0);
AnalyzePackets(packets, 2);
const uint64_t duration = zones_[0].Duration();
zones_[0].Set(0, 0, 0);
HWY_DASSERT(depth_ == 0);
num_zones_ = 0;
return duration;
}
void SetSelfOverhead(const uint64_t self_overhead) {
self_overhead_ = self_overhead;
}
void SetChildOverhead(const uint64_t child_overhead) {
child_overhead_ = child_overhead;
}
// Draw all required information from the packets, which can be discarded
// afterwards. Called whenever this thread's storage is full.
void AnalyzePackets(const Packet* packets, const size_t num_packets) {
namespace hn = HWY_NAMESPACE;
const uint64_t t0 = hn::timer::Start();
for (size_t i = 0; i < num_packets; ++i) {
const Packet p = packets[i];
// Entering a zone
if (p.BiasedOffset() != Packet::kOffsetBias) {
HWY_DASSERT(depth_ < kMaxDepth);
nodes_[depth_].packet = p;
nodes_[depth_].child_total = 0;
++depth_;
continue;
}
HWY_DASSERT(depth_ != 0);
const Node& node = nodes_[depth_ - 1];
// Masking correctly handles unsigned wraparound.
const uint64_t duration =
(p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
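      // Wraparound example: if the 39-bit timestamp wraps between entry
      // (near 2^39) and exit (small), unsigned subtraction modulo 2^64
      // followed by masking equals subtraction modulo 2^39, so the duration
      // is still correct.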
const uint64_t self_duration = ClampedSubtract(
duration, self_overhead_ + child_overhead_ + node.child_total);
UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
--depth_;
// Deduct this nested node's time from its parent's self_duration.
if (depth_ != 0) {
nodes_[depth_ - 1].child_total += duration + child_overhead_;
}
}
const uint64_t t1 = hn::timer::Stop();
analyze_elapsed_ += t1 - t0;
}
// Incorporates results from another thread. Call after all threads have
// exited any zones.
void Assimilate(const Results& other) {
namespace hn = HWY_NAMESPACE;
const uint64_t t0 = hn::timer::Start();
HWY_DASSERT(depth_ == 0);
HWY_DASSERT(other.depth_ == 0);
for (size_t i = 0; i < other.num_zones_; ++i) {
const Accumulator& zone = other.zones_[i];
UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
}
const uint64_t t1 = hn::timer::Stop();
analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
}
// Single-threaded.
void Print() {
namespace hn = HWY_NAMESPACE;
const uint64_t t0 = hn::timer::Start();
MergeDuplicates();
// Sort by decreasing total (self) cost.
// VQSort(&zones_[0].u128, num_zones_, SortDescending());
std::sort(zones_, zones_ + num_zones_,
[](const Accumulator& r1, const Accumulator& r2) {
return r1.Duration() > r2.Duration();
});
const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();
const char* string_origin = StringOrigin();
for (size_t i = 0; i < num_zones_; ++i) {
const Accumulator& r = zones_[i];
const uint64_t num_calls = r.NumCalls();
printf("%-40s: %10zu x %15zu = %9.6f\n", string_origin + r.BiasedOffset(),
num_calls, r.Duration() / num_calls,
static_cast<double>(r.Duration()) * inv_freq);
}
const uint64_t t1 = hn::timer::Stop();
analyze_elapsed_ += t1 - t0;
printf("Total analysis [s]: %f\n",
static_cast<double>(analyze_elapsed_) * inv_freq);
}
private:
// Updates an existing Accumulator (uniquely identified by biased_offset) or
// adds one if this is the first time this thread analyzed that zone.
// Uses a self-organizing list data structure, which avoids dynamic memory
// allocations and is far faster than unordered_map. Loads, updates and
// stores the entire Accumulator with vector instructions.
void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
const uint64_t duration) {
HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
// Special case for first zone: (maybe) update, without swapping.
if (zones_[0].BiasedOffset() == biased_offset) {
zones_[0].Add(num_calls, duration);
HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
return;
}
// Look for a zone with the same offset.
for (size_t i = 1; i < num_zones_; ++i) {
if (zones_[i].BiasedOffset() == biased_offset) {
zones_[i].Add(num_calls, duration);
HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
// Swap with predecessor (more conservative than move to front,
// but at least as successful).
const Accumulator prev = zones_[i - 1];
zones_[i - 1] = zones_[i];
zones_[i] = prev;
return;
}
}
// Not found; create a new Accumulator.
HWY_DASSERT(num_zones_ < kMaxZones);
Accumulator* HWY_RESTRICT zone = zones_ + num_zones_;
zone->Set(biased_offset, num_calls, duration);
HWY_DASSERT(zone->BiasedOffset() == biased_offset);
++num_zones_;
}
// Each instantiation of a function template seems to get its own copy of
// __func__ and GCC doesn't merge them. An N^2 search for duplicates is
// acceptable because we only expect a few dozen zones.
void MergeDuplicates() {
const char* string_origin = StringOrigin();
for (size_t i = 0; i < num_zones_; ++i) {
const size_t biased_offset = zones_[i].BiasedOffset();
const char* name = string_origin + biased_offset;
// Separate num_calls from biased_offset so we can add them together.
uint64_t num_calls = zones_[i].NumCalls();
// Add any subsequent duplicates to num_calls and total_duration.
for (size_t j = i + 1; j < num_zones_;) {
if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
num_calls += zones_[j].NumCalls();
zones_[i].Add(0, zones_[j].Duration());
// Fill hole with last item.
zones_[j] = zones_[--num_zones_];
} else { // Name differed, try next Accumulator.
++j;
}
}
HWY_DASSERT(num_calls < (1ULL << Accumulator::kNumCallBits));
// Re-pack regardless of whether any duplicates were found.
zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
}
}
uint64_t analyze_elapsed_ = 0;
uint64_t self_overhead_ = 0;
uint64_t child_overhead_ = 0;
size_t depth_ = 0; // Number of active zones.
size_t num_zones_ = 0; // Number of retired zones.
alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth]; // Stack
alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones]; // Self-organizing list
};
// Per-thread packet storage, dynamically allocated.
class ThreadSpecific {
static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);
public:
// "name" is used to sanity-check offsets fit in kOffsetBits.
explicit ThreadSpecific(const char* name)
: max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
packets_(AllocateAligned<Packet>(max_packets_)),
num_packets_(0),
string_origin_(StringOrigin()) {
// Even in optimized builds, verify that this zone's name offset fits
// within the allotted space. If not, UpdateOrAdd is likely to overrun
// zones_[]. Checking here on the cold path (only reached once per thread)
// is cheap, but it only covers one zone.
const size_t biased_offset = name - string_origin_;
    HWY_ASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
}
// Depends on Zone => defined below.
void ComputeOverhead();
void WriteEntry(const char* name, const uint64_t timestamp) {
const size_t biased_offset = name - string_origin_;
Write(Packet::Make(biased_offset, timestamp));
}
void WriteExit(const uint64_t timestamp) {
const size_t biased_offset = Packet::kOffsetBias;
Write(Packet::Make(biased_offset, timestamp));
}
void AnalyzeRemainingPackets() {
// Ensures prior weakly-ordered streaming stores are globally visible.
FlushStream();
// Storage full => empty it.
if (num_packets_ + buffer_size_ > max_packets_) {
results_.AnalyzePackets(packets_.get(), num_packets_);
num_packets_ = 0;
}
CopyBytes(buffer_, packets_.get() + num_packets_,
buffer_size_ * sizeof(Packet));
num_packets_ += buffer_size_;
results_.AnalyzePackets(packets_.get(), num_packets_);
num_packets_ = 0;
}
Results& GetResults() { return results_; }
private:
// Write packet to buffer/storage, emptying them as needed.
void Write(const Packet packet) {
// Buffer full => copy to storage.
if (buffer_size_ == kBufferCapacity) {
// Storage full => empty it.
if (num_packets_ + kBufferCapacity > max_packets_) {
results_.AnalyzePackets(packets_.get(), num_packets_);
num_packets_ = 0;
}
// This buffering halves observer overhead and decreases the overall
// runtime by about 3%. Casting is safe because the first member is u64.
StreamCacheLine(
reinterpret_cast<const uint64_t*>(buffer_),
reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
num_packets_ += kBufferCapacity;
buffer_size_ = 0;
}
buffer_[buffer_size_] = packet;
++buffer_size_;
}
// Write-combining buffer to avoid cache pollution. Must be the first
// non-static member to ensure cache-line alignment.
Packet buffer_[kBufferCapacity];
size_t buffer_size_ = 0;
const size_t max_packets_;
// Contiguous storage for zone enter/exit packets.
AlignedFreeUniquePtr<Packet[]> packets_;
size_t num_packets_;
// Cached here because we already read this cache line on zone entry/exit.
const char* HWY_RESTRICT string_origin_;
Results results_;
};
class ThreadList {
public:
// Called from any thread.
ThreadSpecific* Add(const char* name) {
const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
HWY_DASSERT(index < kMaxThreads);
ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
threads_[index].store(ts, std::memory_order_release);
return ts;
}
// Single-threaded.
void PrintResults() {
const auto acq = std::memory_order_acquire;
const size_t num_threads = num_threads_.load(acq);
ThreadSpecific* main = threads_[0].load(acq);
main->AnalyzeRemainingPackets();
for (size_t i = 1; i < num_threads; ++i) {
ThreadSpecific* ts = threads_[i].load(acq);
ts->AnalyzeRemainingPackets();
main->GetResults().Assimilate(ts->GetResults());
}
if (num_threads != 0) {
main->GetResults().Print();
}
}
private:
// Owning pointers.
alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];
std::atomic<size_t> num_threads_{0};
};
// RAII zone enter/exit recorder constructed by the ZONE macro; also
// responsible for initializing ThreadSpecific.
class Zone {
public:
// "name" must be a string literal (see StringOrigin).
HWY_NOINLINE explicit Zone(const char* name) {
HWY_FENCE;
ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
if (HWY_UNLIKELY(thread_specific == nullptr)) {
// Ensure the CPU supports our timer.
char cpu[100];
if (!platform::HaveTimerStop(cpu)) {
HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
}
thread_specific = StaticThreadSpecific() = Threads().Add(name);
// Must happen after setting StaticThreadSpecific, because ComputeOverhead
// also calls Zone().
thread_specific->ComputeOverhead();
}
// (Capture timestamp ASAP, not inside WriteEntry.)
HWY_FENCE;
const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
thread_specific->WriteEntry(name, timestamp);
}
HWY_NOINLINE ~Zone() {
HWY_FENCE;
const uint64_t timestamp = HWY_NAMESPACE::timer::Stop();
StaticThreadSpecific()->WriteExit(timestamp);
HWY_FENCE;
}
// Call exactly once after all threads have exited all zones.
static void PrintResults() { Threads().PrintResults(); }
private:
// Returns reference to the thread's ThreadSpecific pointer (initially null).
// Function-local static avoids needing a separate definition.
static ThreadSpecific*& StaticThreadSpecific() {
static thread_local ThreadSpecific* thread_specific;
return thread_specific;
}
// Returns the singleton ThreadList. Non time-critical.
static ThreadList& Threads() {
static ThreadList threads_;
return threads_;
}
};
// Creates a zone starting from here until the end of the current scope.
// Timestamps will be recorded when entering and exiting the zone.
// "name" must be a string literal, which is ensured by merging with "".
#define PROFILER_ZONE(name) \
HWY_FENCE; \
const hwy::Zone zone("" name); \
HWY_FENCE
// Creates a zone for an entire function (when placed at its beginning).
// Shorter/more convenient than ZONE.
#define PROFILER_FUNC \
HWY_FENCE; \
const hwy::Zone zone(__func__); \
HWY_FENCE
#define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults
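// Typical shutdown sequence (sketch): join all worker threads, then call
// PROFILER_PRINT_RESULTS() exactly once, from a single thread.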
inline void ThreadSpecific::ComputeOverhead() {
namespace hn = HWY_NAMESPACE;
  // Delay after capturing timestamps before/after the actual zone runs. Even
  // with frequency throttling disabled, this has a multimodal distribution
  // with commonly observed values [cycles] of 32, 34, 48, 52, 59 and 62.
uint64_t self_overhead;
{
const size_t kNumSamples = 32;
uint32_t samples[kNumSamples];
for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
const size_t kNumDurations = 1024;
uint32_t durations[kNumDurations];
for (size_t idx_duration = 0; idx_duration < kNumDurations;
++idx_duration) {
{
PROFILER_ZONE("Dummy Zone (never shown)");
}
const uint64_t duration = results_.ZoneDuration(buffer_);
buffer_size_ = 0;
durations[idx_duration] = static_cast<uint32_t>(duration);
HWY_DASSERT(num_packets_ == 0);
}
robust_statistics::CountingSort(durations, kNumDurations);
samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
}
// Median.
robust_statistics::CountingSort(samples, kNumSamples);
self_overhead = samples[kNumSamples / 2];
    if (PROFILER_PRINT_OVERHEAD) {
      printf("Overhead: %llu\n",
             static_cast<unsigned long long>(self_overhead));
    }
results_.SetSelfOverhead(self_overhead);
}
// Delay before capturing start timestamp / after end timestamp.
const size_t kNumSamples = 32;
uint32_t samples[kNumSamples];
for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
const size_t kNumDurations = 16;
uint32_t durations[kNumDurations];
for (size_t idx_duration = 0; idx_duration < kNumDurations;
++idx_duration) {
const size_t kReps = 10000;
// Analysis time should not be included => must fit within buffer.
HWY_DASSERT(kReps * 2 < max_packets_);
std::atomic_thread_fence(std::memory_order_seq_cst);
const uint64_t t0 = hn::timer::Start();
for (size_t i = 0; i < kReps; ++i) {
PROFILER_ZONE("Dummy");
}
FlushStream();
const uint64_t t1 = hn::timer::Stop();
HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
buffer_size_ = 0;
num_packets_ = 0;
const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
durations[idx_duration] =
static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
}
robust_statistics::CountingSort(durations, kNumDurations);
samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
}
robust_statistics::CountingSort(samples, kNumSamples);
const uint64_t child_overhead = samples[9 * kNumSamples / 10];
  if (PROFILER_PRINT_OVERHEAD) {
    printf("Child overhead: %llu\n",
           static_cast<unsigned long long>(child_overhead));
  }
results_.SetChildOverhead(child_overhead);
}
#pragma pack(pop)
} // namespace hwy
#endif // PROFILER_ENABLED || HWY_IDE
#if !PROFILER_ENABLED && !HWY_IDE
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
#endif
#endif // HIGHWAY_HWY_PROFILER_H_