/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "jit/MacroAssembler.h"
#include "jit/x86-shared/MacroAssembler-x86-shared.h"
#include "jit/MacroAssembler-inl.h"
using namespace js;
using namespace js::jit;
using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::Maybe;
using mozilla::SpecificNaN;
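// Splat helpers: broadcast a scalar (from a GPR or the low lane of an XMM
// register) into every lane of a SIMD128 register. With AVX2 a single
// vpbroadcast* does the job; otherwise the value is replicated with shuffles.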
void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
vmovd(input, output);
if (HasAVX2()) {
vbroadcastb(Operand(output), output);
return;
}
vpxor(scratch, scratch, scratch);
vpshufb(scratch, output, output);
}
void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
vmovd(input, output);
if (HasAVX2()) {
vbroadcastw(Operand(output), output);
return;
}
vpshuflw(0, output, output);
vpshufd(0, output, output);
}
void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
vmovd(input, output);
if (HasAVX2()) {
vbroadcastd(Operand(output), output);
return;
}
vpshufd(0, output, output);
}
void MacroAssemblerX86Shared::splatX4(FloatRegister input,
FloatRegister output) {
MOZ_ASSERT(input.isSingle() && output.isSimd128());
if (HasAVX2()) {
vbroadcastss(Operand(input), output);
return;
}
input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output);
vshufps(0, input, input, output);
}
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
FloatRegister output) {
MOZ_ASSERT(input.isDouble() && output.isSimd128());
vmovddup(Operand(input.asSimd128()), output);
}
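// Lane extraction: lane 0 already sits in the low bits of the register and
// can be moved out directly; other lanes are brought low with vpextr* or a
// shuffle first.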
void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
Register output,
unsigned lane) {
if (lane == 0) {
// The value we want to extract is in the low double-word
moveLowInt32(input, output);
} else {
vpextrd(lane, input, output);
}
}
void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
FloatRegister output,
unsigned lane) {
MOZ_ASSERT(input.isSimd128() && output.isSingle());
if (lane == 0) {
// The value we want to extract is in the low double-word
if (input.asSingle() != output) {
moveFloat32(input, output);
}
} else if (lane == 2) {
moveHighPairToLowPairFloat32(input, output);
} else {
uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
FloatRegister dest = output.asSimd128();
input = moveSimd128FloatIfNotAVX(input, dest);
vshufps(mask, input, input, dest);
}
}
void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
FloatRegister output,
unsigned lane) {
MOZ_ASSERT(input.isSimd128() && output.isDouble());
if (lane == 0) {
// The value we want to extract is in the low quadword
if (input.asDouble() != output) {
moveDouble(input, output);
}
} else {
vpalignr(Operand(input), output, output, 8);
}
}
void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
Register output, unsigned lane,
SimdSign sign) {
vpextrw(lane, input, Operand(output));
if (sign == SimdSign::Signed) {
movswl(output, output);
}
}
void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
Register output, unsigned lane,
SimdSign sign) {
vpextrb(lane, input, Operand(output));
if (sign == SimdSign::Signed) {
if (!AllocatableGeneralRegisterSet(Registers::SingleByteRegs).has(output)) {
xchgl(eax, output);
movsbl(eax, eax);
xchgl(eax, output);
} else {
movsbl(output, output);
}
}
}
void MacroAssemblerX86Shared::replaceLaneFloat32x4(unsigned lane,
FloatRegister lhs,
FloatRegister rhs,
FloatRegister dest) {
MOZ_ASSERT(lhs.isSimd128() && rhs.isSingle());
if (lane == 0) {
if (rhs.asSimd128() == lhs) {
// no-op, although this should not normally happen for type checking
// reasons higher up in the stack.
moveSimd128Float(lhs, dest);
} else {
// move low dword of value into low dword of output
vmovss(rhs, lhs, dest);
}
} else {
vinsertps(vinsertpsMask(0, lane), rhs, lhs, dest);
}
}
void MacroAssemblerX86Shared::replaceLaneFloat64x2(unsigned lane,
FloatRegister lhs,
FloatRegister rhs,
FloatRegister dest) {
MOZ_ASSERT(lhs.isSimd128() && rhs.isDouble());
if (lane == 0) {
if (rhs.asSimd128() == lhs) {
// no-op, although this should not normally happen for type checking
// reasons higher up in the stack.
moveSimd128Float(lhs, dest);
} else {
// move low qword of value into low qword of output
vmovsd(rhs, lhs, dest);
}
} else {
// move low qword of value into high qword of output
vshufpd(0, rhs, lhs, dest);
}
}
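// Blends combine lanes of lhs and rhs under a compile-time mask: the 8x16
// form materializes the mask as a constant and uses vpblendvb, while the
// 16x8 form packs the nonzero entries of `lanes` into the immediate byte of
// vpblendw.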
void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
FloatRegister output,
FloatRegister temp,
const uint8_t lanes[16]) {
asMasm().loadConstantSimd128Int(
SimdConstant::CreateX16(reinterpret_cast<const int8_t*>(lanes)), temp);
vpblendvb(temp, rhs, lhs, output);
}
void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
FloatRegister output,
const uint16_t lanes[8]) {
uint32_t mask = 0;
for (unsigned i = 0; i < 8; i++) {
if (lanes[i]) {
mask |= (1 << i);
}
}
vpblendw(mask, rhs, lhs, output);
}
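// Per-byte variable select: vpblendvb picks each output byte from one operand
// or the other according to the high bit of the corresponding mask byte.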
void MacroAssemblerX86Shared::laneSelectSimd128(FloatRegister mask,
FloatRegister lhs,
FloatRegister rhs,
FloatRegister output) {
vpblendvb(mask, lhs, rhs, output);
}
void MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs,
FloatRegister rhs,
FloatRegister output,
const uint8_t lanes[16]) {
ScratchSimd128Scope scratch(asMasm());
// Use pshufb instructions to gather the lanes from each source vector.
// A negative index creates a zero lane, so the two vectors can be combined.
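// For example, lanes[i] == 3 selects byte 3 of lhs and lanes[i] == 19
// selects byte 3 of rhs.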
// Set scratch = lanes from rhs.
int8_t idx[16];
for (unsigned i = 0; i < 16; i++) {
idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
}
rhs = moveSimd128IntIfNotAVX(rhs, scratch);
asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), rhs, scratch);
// Set output = lanes from lhs.
for (unsigned i = 0; i < 16; i++) {
idx[i] = lanes[i] < 16 ? lanes[i] : -1;
}
lhs = moveSimd128IntIfNotAVX(lhs, output);
asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), lhs, output);
// Combine.
vpor(scratch, output, output);
}
static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
}
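// Integer comparisons are synthesized from the pcmpgt*/pcmpeq* primitives:
// LessThan runs pcmpgt* with the operands swapped, NotEqual and the *OrEqual
// conditions complement a primitive result, and the unsigned conditions use
// unsigned min/max followed by a compare-for-equality.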
void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
Assembler::Condition cond,
FloatRegister output) {
switch (cond) {
case Assembler::Condition::GreaterThan:
vpcmpgtb(rhs, lhs, output);
break;
case Assembler::Condition::Equal:
vpcmpeqb(rhs, lhs, output);
break;
case Assembler::Condition::LessThan: {
ScratchSimd128Scope scratch(asMasm());
if (lhs == output) {
moveSimd128Int(lhs, scratch);
lhs = scratch;
}
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), output);
} else {
loadAlignedSimd128Int(rhs, output);
}
vpcmpgtb(Operand(lhs), output, output);
break;
}
case Assembler::Condition::NotEqual:
vpcmpeqb(rhs, lhs, output);
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Condition::GreaterThanOrEqual: {
ScratchSimd128Scope scratch(asMasm());
if (lhs == output) {
moveSimd128Int(lhs, scratch);
lhs = scratch;
}
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), output);
} else {
loadAlignedSimd128Int(rhs, output);
}
vpcmpgtb(Operand(lhs), output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Condition::LessThanOrEqual:
// lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
vpcmpgtb(rhs, lhs, output);
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Above:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpminub(rhs, lhs, output);
vpcmpeqb(Operand(lhs), output, output);
} else {
vpmaxub(rhs, lhs, output);
vpcmpeqb(rhs, output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::BelowOrEqual:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpminub(rhs, lhs, output);
vpcmpeqb(Operand(lhs), output, output);
} else {
vpmaxub(rhs, lhs, output);
vpcmpeqb(rhs, output, output);
}
break;
case Assembler::Below:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpmaxub(rhs, lhs, output);
vpcmpeqb(Operand(lhs), output, output);
} else {
vpminub(rhs, lhs, output);
vpcmpeqb(rhs, output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::AboveOrEqual:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpmaxub(rhs, lhs, output);
vpcmpeqb(Operand(lhs), output, output);
} else {
vpminub(rhs, lhs, output);
vpcmpeqb(rhs, output, output);
}
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
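// Constant-rhs form: only Equal and GreaterThan have hardware primitives, so
// NotEqual and LessThanOrEqual compute the opposite condition and then flip
// every bit with an XOR against all-ones.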
void MacroAssemblerX86Shared::compareInt8x16(Assembler::Condition cond,
FloatRegister lhs,
const SimdConstant& rhs,
FloatRegister dest) {
bool complement = false;
switch (cond) {
case Assembler::Condition::NotEqual:
complement = true;
[[fallthrough]];
case Assembler::Condition::Equal:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqb,
&MacroAssembler::vpcmpeqbSimd128);
break;
case Assembler::Condition::LessThanOrEqual:
complement = true;
[[fallthrough]];
case Assembler::Condition::GreaterThan:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtb,
&MacroAssembler::vpcmpgtbSimd128);
break;
default:
MOZ_CRASH("unexpected condition op");
}
if (complement) {
asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
}
}
void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
Assembler::Condition cond,
FloatRegister output) {
switch (cond) {
case Assembler::Condition::GreaterThan:
vpcmpgtw(rhs, lhs, output);
break;
case Assembler::Condition::Equal:
vpcmpeqw(rhs, lhs, output);
break;
case Assembler::Condition::LessThan: {
ScratchSimd128Scope scratch(asMasm());
if (lhs == output) {
moveSimd128Int(lhs, scratch);
lhs = scratch;
}
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), output);
} else {
loadAlignedSimd128Int(rhs, output);
}
vpcmpgtw(Operand(lhs), output, output);
break;
}
case Assembler::Condition::NotEqual:
vpcmpeqw(rhs, lhs, output);
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Condition::GreaterThanOrEqual: {
ScratchSimd128Scope scratch(asMasm());
if (lhs == output) {
moveSimd128Int(lhs, scratch);
lhs = scratch;
}
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), output);
} else {
loadAlignedSimd128Int(rhs, output);
}
vpcmpgtw(Operand(lhs), output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Condition::LessThanOrEqual:
// lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
vpcmpgtw(rhs, lhs, output);
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Above:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpminuw(rhs, lhs, output);
vpcmpeqw(Operand(lhs), output, output);
} else {
vpmaxuw(rhs, lhs, output);
vpcmpeqw(rhs, output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::BelowOrEqual:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpminuw(rhs, lhs, output);
vpcmpeqw(Operand(lhs), output, output);
} else {
vpmaxuw(rhs, lhs, output);
vpcmpeqw(rhs, output, output);
}
break;
case Assembler::Below:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpmaxuw(rhs, lhs, output);
vpcmpeqw(Operand(lhs), output, output);
} else {
vpminuw(rhs, lhs, output);
vpcmpeqw(rhs, output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::AboveOrEqual:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpmaxuw(rhs, lhs, output);
vpcmpeqw(Operand(lhs), output, output);
} else {
vpminuw(rhs, lhs, output);
vpcmpeqw(rhs, output, output);
}
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareInt16x8(Assembler::Condition cond,
FloatRegister lhs,
const SimdConstant& rhs,
FloatRegister dest) {
bool complement = false;
switch (cond) {
case Assembler::Condition::NotEqual:
complement = true;
[[fallthrough]];
case Assembler::Condition::Equal:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqw,
&MacroAssembler::vpcmpeqwSimd128);
break;
case Assembler::Condition::LessThanOrEqual:
complement = true;
[[fallthrough]];
case Assembler::Condition::GreaterThan:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtw,
&MacroAssembler::vpcmpgtwSimd128);
break;
default:
MOZ_CRASH("unexpected condition op");
}
if (complement) {
asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
}
}
void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
Assembler::Condition cond,
FloatRegister output) {
switch (cond) {
case Assembler::Condition::GreaterThan:
vpcmpgtd(rhs, lhs, output);
break;
case Assembler::Condition::Equal:
vpcmpeqd(rhs, lhs, output);
break;
case Assembler::Condition::LessThan: {
ScratchSimd128Scope scratch(asMasm());
if (lhs == output) {
moveSimd128Int(lhs, scratch);
lhs = scratch;
}
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), output);
} else {
loadAlignedSimd128Int(rhs, output);
}
vpcmpgtd(Operand(lhs), output, output);
break;
}
case Assembler::Condition::NotEqual:
vpcmpeqd(rhs, lhs, output);
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Condition::GreaterThanOrEqual: {
ScratchSimd128Scope scratch(asMasm());
if (lhs == output) {
moveSimd128Int(lhs, scratch);
lhs = scratch;
}
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), output);
} else {
loadAlignedSimd128Int(rhs, output);
}
vpcmpgtd(Operand(lhs), output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Condition::LessThanOrEqual:
// lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
vpcmpgtd(rhs, lhs, output);
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::Above:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpminud(rhs, lhs, output);
vpcmpeqd(Operand(lhs), output, output);
} else {
vpmaxud(rhs, lhs, output);
vpcmpeqd(rhs, output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::BelowOrEqual:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpminud(rhs, lhs, output);
vpcmpeqd(Operand(lhs), output, output);
} else {
vpmaxud(rhs, lhs, output);
vpcmpeqd(rhs, output, output);
}
break;
case Assembler::Below:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpmaxud(rhs, lhs, output);
vpcmpeqd(Operand(lhs), output, output);
} else {
vpminud(rhs, lhs, output);
vpcmpeqd(rhs, output, output);
}
asMasm().bitwiseNotSimd128(output, output);
break;
case Assembler::AboveOrEqual:
if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
vpmaxud(rhs, lhs, output);
vpcmpeqd(Operand(lhs), output, output);
} else {
vpminud(rhs, lhs, output);
vpcmpeqd(rhs, output, output);
}
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareInt32x4(Assembler::Condition cond,
FloatRegister lhs,
const SimdConstant& rhs,
FloatRegister dest) {
bool complement = false;
switch (cond) {
case Assembler::Condition::NotEqual:
complement = true;
[[fallthrough]];
case Assembler::Condition::Equal:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqd,
&MacroAssembler::vpcmpeqdSimd128);
break;
case Assembler::Condition::LessThanOrEqual:
complement = true;
[[fallthrough]];
case Assembler::Condition::GreaterThan:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtd,
&MacroAssembler::vpcmpgtdSimd128);
break;
default:
MOZ_CRASH("unexpected condition op");
}
if (complement) {
asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
}
}
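// 64-bit lane compares: pcmpeqq (SSE4.1) covers equality directly, but there
// is no ordered 64-bit compare before vpcmpgtq (SSE4.2), so the ordering
// variants below synthesize it from 32-bit subtract/compare steps.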
void MacroAssemblerX86Shared::compareForEqualityInt64x2(
FloatRegister lhs, Operand rhs, Assembler::Condition cond,
FloatRegister output) {
static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
switch (cond) {
case Assembler::Condition::Equal:
vpcmpeqq(rhs, lhs, output);
break;
case Assembler::Condition::NotEqual:
vpcmpeqq(rhs, lhs, output);
asMasm().bitwiseXorSimd128(output, allOnes, output);
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareForOrderingInt64x2(
FloatRegister lhs, Operand rhs, Assembler::Condition cond,
FloatRegister temp1, FloatRegister temp2, FloatRegister output) {
static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
// The pseudo code (shown for the > comparison) is:
// __m128i pcmpgtq_sse2 (__m128i a, __m128i b) {
// __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a));
// r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b));
// return _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1));
// }
switch (cond) {
case Assembler::Condition::GreaterThan:
vmovdqa(rhs, temp1);
vmovdqa(Operand(lhs), temp2);
vpsubq(Operand(lhs), temp1, temp1);
vpcmpeqd(rhs, temp2, temp2);
vandpd(temp2, temp1, temp1);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpcmpgtd(rhs, lhs, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
break;
case Assembler::Condition::LessThan:
vmovdqa(rhs, temp1);
vmovdqa(Operand(lhs), temp2);
vpcmpgtd(Operand(lhs), temp1, temp1);
vpcmpeqd(Operand(rhs), temp2, temp2);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpsubq(rhs, lhs, output);
vandpd(temp2, output, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
break;
case Assembler::Condition::GreaterThanOrEqual:
vmovdqa(rhs, temp1);
vmovdqa(Operand(lhs), temp2);
vpcmpgtd(Operand(lhs), temp1, temp1);
vpcmpeqd(Operand(rhs), temp2, temp2);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpsubq(rhs, lhs, output);
vandpd(temp2, output, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
asMasm().bitwiseXorSimd128(output, allOnes, output);
break;
case Assembler::Condition::LessThanOrEqual:
vmovdqa(rhs, temp1);
vmovdqa(Operand(lhs), temp2);
vpsubq(Operand(lhs), temp1, temp1);
vpcmpeqd(rhs, temp2, temp2);
vandpd(temp2, temp1, temp1);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpcmpgtd(rhs, lhs, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
asMasm().bitwiseXorSimd128(output, allOnes, output);
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareForOrderingInt64x2AVX(
FloatRegister lhs, FloatRegister rhs, Assembler::Condition cond,
FloatRegister output) {
MOZ_ASSERT(HasSSE42());
static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
switch (cond) {
case Assembler::Condition::GreaterThan:
vpcmpgtq(Operand(rhs), lhs, output);
break;
case Assembler::Condition::LessThan:
vpcmpgtq(Operand(lhs), rhs, output);
break;
case Assembler::Condition::GreaterThanOrEqual:
vpcmpgtq(Operand(lhs), rhs, output);
asMasm().bitwiseXorSimd128(output, allOnes, output);
break;
case Assembler::Condition::LessThanOrEqual:
vpcmpgtq(Operand(rhs), lhs, output);
asMasm().bitwiseXorSimd128(output, allOnes, output);
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
Assembler::Condition cond,
FloatRegister output) {
// TODO Can do better here with three-address compares
// Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
// This is bad, but Ion does not need this fixup.
ScratchSimd128Scope scratch(asMasm());
if (!HasAVX() && !lhs.aliases(output)) {
if (rhs.kind() == Operand::FPREG &&
output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
vmovaps(rhs, scratch);
rhs = Operand(scratch);
}
vmovaps(lhs, output);
lhs = output;
}
switch (cond) {
case Assembler::Condition::Equal:
vcmpeqps(rhs, lhs, output);
break;
case Assembler::Condition::LessThan:
vcmpltps(rhs, lhs, output);
break;
case Assembler::Condition::LessThanOrEqual:
vcmpleps(rhs, lhs, output);
break;
case Assembler::Condition::NotEqual:
vcmpneqps(rhs, lhs, output);
break;
case Assembler::Condition::GreaterThanOrEqual:
case Assembler::Condition::GreaterThan:
// We reverse these operations in the -inl.h file so that we don't have to
// copy into and out of temporaries after codegen.
MOZ_CRASH("should have reversed this");
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareFloat32x4(Assembler::Condition cond,
FloatRegister lhs,
const SimdConstant& rhs,
FloatRegister dest) {
switch (cond) {
case Assembler::Condition::Equal:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqps,
&MacroAssembler::vcmpeqpsSimd128);
break;
case Assembler::Condition::LessThan:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltps,
&MacroAssembler::vcmpltpsSimd128);
break;
case Assembler::Condition::LessThanOrEqual:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpleps,
&MacroAssembler::vcmplepsSimd128);
break;
case Assembler::Condition::NotEqual:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqps,
&MacroAssembler::vcmpneqpsSimd128);
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
Assembler::Condition cond,
FloatRegister output) {
// TODO Can do better here with three-address compares
// Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
// This is bad, but Ion does not need this fixup.
ScratchSimd128Scope scratch(asMasm());
if (!HasAVX() && !lhs.aliases(output)) {
if (rhs.kind() == Operand::FPREG &&
output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
vmovapd(rhs, scratch);
rhs = Operand(scratch);
}
vmovapd(lhs, output);
lhs = output;
}
switch (cond) {
case Assembler::Condition::Equal:
vcmpeqpd(rhs, lhs, output);
break;
case Assembler::Condition::LessThan:
vcmpltpd(rhs, lhs, output);
break;
case Assembler::Condition::LessThanOrEqual:
vcmplepd(rhs, lhs, output);
break;
case Assembler::Condition::NotEqual:
vcmpneqpd(rhs, lhs, output);
break;
case Assembler::Condition::GreaterThanOrEqual:
case Assembler::Condition::GreaterThan:
// We reverse these operations in the -inl.h file so that we don't have to
// copy into and out of temporaries after codegen.
MOZ_CRASH("should have reversed this");
default:
MOZ_CRASH("unexpected condition op");
}
}
void MacroAssemblerX86Shared::compareFloat64x2(Assembler::Condition cond,
FloatRegister lhs,
const SimdConstant& rhs,
FloatRegister dest) {
switch (cond) {
case Assembler::Condition::Equal:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqpd,
&MacroAssembler::vcmpeqpdSimd128);
break;
case Assembler::Condition::LessThan:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltpd,
&MacroAssembler::vcmpltpdSimd128);
break;
case Assembler::Condition::LessThanOrEqual:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmplepd,
&MacroAssembler::vcmplepdSimd128);
break;
case Assembler::Condition::NotEqual:
binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqpd,
&MacroAssembler::vcmpneqpdSimd128);
break;
default:
MOZ_CRASH("unexpected condition op");
}
}
// Semantics of wasm max and min.
//
// * -0 < 0
// * If one input is NaN then that NaN is the output
// * If both inputs are NaN then the output is selected nondeterministically
// * Any returned NaN is always made quiet
// * The MVP spec 2.2.3 says "No distinction is made between signalling and
// quiet NaNs", suggesting SNaN inputs are allowed and should not fault
//
// Semantics of maxps/minps/maxpd/minpd:
//
// * If the values are both +/-0 the rhs is returned
// * If the rhs is SNaN then the rhs is returned
// * If either value is NaN then the rhs is returned
// * An SNaN operand does not appear to give rise to an exception, at least
// not in the JS shell on Linux, though the Intel spec lists Invalid
// as one of the possible exceptions
// Various unaddressed considerations:
//
// It's pretty insane for this to take an Operand rhs - it really needs to be
// a register, given the number of times we access it.
//
// Constant load can be folded into the ANDPS. Do we care? It won't save us
// any registers, since output/temp1/temp2/scratch are all live at the same time
// after the first instruction of the slow path.
//
// Can we use blend for the NaN extraction/insertion? We'd need xmm0 for the
// mask, which is no fun. But it would be lhs UNORD lhs -> mask, blend;
// rhs UNORD rhs -> mask; blend. Better than the mess we have below. But
// we'd still need to setup the QNaN bits, unless we can blend those too
// with the lhs UNORD rhs mask?
//
// If we could determine that both input lanes are NaN then the result of the
// fast path should be fine modulo the QNaN bits, but it's not obvious this is
// much of an advantage.
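//
// For reference, a scalar sketch of the wasm min semantics implemented below
// (illustrative only, not compiled; assumes std::isnan/std::signbit from
// <cmath>, and quiet_nan() stands for any quiet NaN):
//
//   float wasm_fmin(float a, float b) {
//     if (std::isnan(a) || std::isnan(b)) return quiet_nan();
//     if (a == b) return std::signbit(a) ? a : b;  // prefers -0 over +0
//     return a < b ? a : b;
//   }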
void MacroAssemblerX86Shared::minMaxFloat32x4(bool isMin, FloatRegister lhs,
Operand rhs, FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
Label l;
SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));
/* clang-format off */ /* leave my comments alone */
lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output);
if (isMin) {
vmovaps(lhs, output); // compute
vminps(rhs, output, output); // min lhs, rhs
vmovaps(rhs, temp1); // compute
vminps(Operand(lhs), temp1, temp1); // min rhs, lhs
vorps(temp1, output, output); // fix min(-0, 0) with OR
} else {
vmovaps(lhs, output); // compute
vmaxps(rhs, output, output); // max lhs, rhs
vmovaps(rhs, temp1); // compute
vmaxps(Operand(lhs), temp1, temp1); // max rhs, lhs
vandps(temp1, output, output); // fix max(-0, 0) with AND
}
vmovaps(lhs, temp1); // compute
vcmpunordps(rhs, temp1, temp1); // lhs UNORD rhs
vptest(temp1, temp1); // check if any unordered
j(Assembler::Equal, &l); // and exit if not
// Slow path.
// output has result for non-NaN lanes, garbage in NaN lanes.
// temp1 has lhs UNORD rhs.
// temp2 is dead.
vmovaps(temp1, temp2); // clear NaN lanes of result
vpandn(output, temp2, temp2); // result now in temp2
asMasm().vpandSimd128(quietBits, temp1, temp1); // setup QNaN bits in NaN lanes
vorps(temp1, temp2, temp2); // and OR into result
vmovaps(lhs, temp1); // find NaN lanes
vcmpunordps(Operand(temp1), temp1, temp1); // in lhs
vmovaps(temp1, output); // (and save them for later)
vandps(lhs, temp1, temp1); // and extract the NaNs
vorps(temp1, temp2, temp2); // and add to the result
vmovaps(rhs, temp1); // find NaN lanes
vcmpunordps(Operand(temp1), temp1, temp1); // in rhs
vpandn(temp1, output, output); // except if they were in lhs
vandps(rhs, output, output); // and extract the NaNs
vorps(temp2, output, output); // and add to the result
bind(&l);
/* clang-format on */
}
void MacroAssemblerX86Shared::minMaxFloat32x4AVX(bool isMin, FloatRegister lhs,
FloatRegister rhs,
FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
Label l;
SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));
/* clang-format off */ /* leave my comments alone */
FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output);
// Allow rhs to be assigned to scratch when rhs == lhs and == output --
// don't make a special case, since the semantics require setting up QNaN bits.
FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output);
if (isMin) {
vminps(Operand(rhs), lhs, temp2); // min lhs, rhs
vminps(Operand(lhs), rhs, temp1); // min rhs, lhs
vorps(temp1, temp2, output); // fix min(-0, 0) with OR
} else {
vmaxps(Operand(rhs), lhs, temp2); // max lhs, rhs
vmaxps(Operand(lhs), rhs, temp1); // max rhs, lhs
vandps(temp1, temp2, output); // fix max(-0, 0) with AND
}
vcmpunordps(Operand(rhsCopy), lhsCopy, temp1); // lhs UNORD rhs
vptest(temp1, temp1); // check if any unordered
j(Assembler::Equal, &l); // and exit if not
// Slow path.
// output has result for non-NaN lanes, garbage in NaN lanes.
// temp1 has lhs UNORD rhs.
// temp2 is dead.
vcmpunordps(Operand(lhsCopy), lhsCopy, temp2); // find NaN lanes in lhs
vblendvps(temp2, lhsCopy, rhsCopy, temp2); // take other lanes from rhs
asMasm().vporSimd128(quietBits, temp2, temp2); // setup QNaN bits in NaN lanes
vblendvps(temp1, temp2, output, output); // replace NaN lanes from temp2
bind(&l);
/* clang-format on */
}
// Exactly as above.
void MacroAssemblerX86Shared::minMaxFloat64x2(bool isMin, FloatRegister lhs,
Operand rhs, FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
Label l;
SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));
/* clang-format off */ /* leave my comments alone */
lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output);
if (isMin) {
vmovapd(lhs, output); // compute
vminpd(rhs, output, output); // min lhs, rhs
vmovapd(rhs, temp1); // compute
vminpd(Operand(lhs), temp1, temp1); // min rhs, lhs
vorpd(temp1, output, output); // fix min(-0, 0) with OR
} else {
vmovapd(lhs, output); // compute
vmaxpd(rhs, output, output); // max lhs, rhs
vmovapd(rhs, temp1); // compute
vmaxpd(Operand(lhs), temp1, temp1); // max rhs, lhs
vandpd(temp1, output, output); // fix max(-0, 0) with AND
}
vmovapd(lhs, temp1); // compute
vcmpunordpd(rhs, temp1, temp1); // lhs UNORD rhs
vptest(temp1, temp1); // check if any unordered
j(Assembler::Equal, &l); // and exit if not
// Slow path.
// output has result for non-NaN lanes, garbage in NaN lanes.
// temp1 has lhs UNORD rhs.
// temp2 is dead.
vmovapd(temp1, temp2); // clear NaN lanes of result
vpandn(output, temp2, temp2); // result now in temp2
asMasm().vpandSimd128(quietBits, temp1, temp1); // setup QNaN bits in NaN lanes
vorpd(temp1, temp2, temp2); // and OR into result
vmovapd(lhs, temp1); // find NaN lanes
vcmpunordpd(Operand(temp1), temp1, temp1); // in lhs
vmovapd(temp1, output); // (and save them for later)
vandpd(lhs, temp1, temp1); // and extract the NaNs
vorpd(temp1, temp2, temp2); // and add to the result
vmovapd(rhs, temp1); // find NaN lanes
vcmpunordpd(Operand(temp1), temp1, temp1); // in rhs
vpandn(temp1, output, output); // except if they were in lhs
vandpd(rhs, output, output); // and extract the NaNs
vorpd(temp2, output, output); // and add to the result
bind(&l);
/* clang-format on */
}
void MacroAssemblerX86Shared::minMaxFloat64x2AVX(bool isMin, FloatRegister lhs,
FloatRegister rhs,
FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
Label l;
SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));
/* clang-format off */ /* leave my comments alone */
FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output);
// Allow rhs to be assigned to scratch when rhs == lhs and == output --
// don't make a special case, since the semantics require setting up QNaN bits.
FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output);
if (isMin) {
vminpd(Operand(rhs), lhs, temp2); // min lhs, rhs
vminpd(Operand(lhs), rhs, temp1); // min rhs, lhs
vorpd(temp1, temp2, output); // fix min(-0, 0) with OR
} else {
vmaxpd(Operand(rhs), lhs, temp2); // max lhs, rhs
vmaxpd(Operand(lhs), rhs, temp1); // max rhs, lhs
vandpd(temp1, temp2, output); // fix max(-0, 0) with AND
}
vcmpunordpd(Operand(rhsCopy), lhsCopy, temp1); // lhs UNORD rhs
vptest(temp1, temp1); // check if any unordered
j(Assembler::Equal, &l); // and exit if not
// Slow path.
// output has result for non-NaN lanes, garbage in NaN lanes.
// temp1 has lhs UNORD rhs.
// temp2 is dead.
vcmpunordpd(Operand(lhsCopy), lhsCopy, temp2); // find NaN lanes in lhs
vblendvpd(temp2, lhsCopy, rhsCopy, temp2); // take other lanes from rhs
asMasm().vporSimd128(quietBits, temp2, tem2); // setup QNaN bits in NaN lanes
vblendvpd(temp1, temp2, output, output); // replace NaN lanes from temp2
bind(&l);
/* clang-format on */
}
void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
if (HasAVX()) {
minMaxFloat32x4AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
return;
}
minMaxFloat32x4(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output);
}
void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
if (HasAVX()) {
minMaxFloat32x4AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
return;
}
minMaxFloat32x4(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output);
}
void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
if (HasAVX()) {
minMaxFloat64x2AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
return;
}
minMaxFloat64x2(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output);
}
void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
FloatRegister temp1,
FloatRegister temp2,
FloatRegister output) {
if (HasAVX()) {
minMaxFloat64x2AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
return;
}
minMaxFloat64x2(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output);
}
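// x86 has no per-byte shift instructions. The variable 8x16 shifts widen the
// bytes to words (sign- or zero-extended, in two halves), shift by word, mask
// off the bits that crossed a byte boundary, and re-pack with vpackuswb.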
void MacroAssemblerX86Shared::packedShiftByScalarInt8x16(
FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest,
void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
FloatRegister),
void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
// High bytes
vpalignr(Operand(in), xtmp, xtmp, 8);
(this->*extend)(Operand(xtmp), xtmp);
(this->*shift)(scratch, xtmp, xtmp);
// Low bytes
(this->*extend)(Operand(dest), dest);
(this->*shift)(scratch, dest, dest);
// Mask off garbage to avoid saturation during packing
asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)),
scratch);
vpand(Operand(scratch), xtmp, xtmp);
vpand(Operand(scratch), dest, dest);
vpackuswb(Operand(xtmp), dest, dest);
}
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
packedShiftByScalarInt8x16(in, count, xtmp, dest,
&MacroAssemblerX86Shared::vpsllw,
&MacroAssemblerX86Shared::vpmovzxbw);
}
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
Imm32 count, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(count.value <= 7);
if (MOZ_UNLIKELY(count.value == 0)) {
moveSimd128Int(src, dest);
return;
}
src = asMasm().moveSimd128IntIfNotAVX(src, dest);
// Use the doubling trick for low shift counts, otherwise mask off the bits
// that are shifted out of the low byte of each word and use word shifts. The
// optimal cutoff remains to be explored.
if (count.value <= 3) {
vpaddb(Operand(src), src, dest);
for (int32_t shift = count.value - 1; shift > 0; --shift) {
vpaddb(Operand(dest), dest, dest);
}
} else {
asMasm().bitwiseAndSimd128(src, SimdConstant::SplatX16(0xFF >> count.value),
dest);
vpsllw(count, dest, dest);
}
}
void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
packedShiftByScalarInt8x16(in, count, xtmp, dest,
&MacroAssemblerX86Shared::vpsraw,
&MacroAssemblerX86Shared::vpmovsxbw);
}
void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
Imm32 count, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(count.value <= 7);
ScratchSimd128Scope scratch(asMasm());
vpunpckhbw(src, scratch, scratch);
vpunpcklbw(src, dest, dest);
vpsraw(Imm32(count.value + 8), scratch, scratch);
vpsraw(Imm32(count.value + 8), dest, dest);
vpacksswb(Operand(scratch), dest, dest);
}
void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
packedShiftByScalarInt8x16(in, count, xtmp, dest,
&MacroAssemblerX86Shared::vpsrlw,
&MacroAssemblerX86Shared::vpmovzxbw);
}
void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
Imm32 count, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(count.value <= 7);
src = asMasm().moveSimd128IntIfNotAVX(src, dest);
asMasm().bitwiseAndSimd128(
src, SimdConstant::SplatX16((0xFF << count.value) & 0xFF), dest);
vpsrlw(count, dest, dest);
}
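// The 16-, 32- and 64-bit variable shifts map directly onto hardware: the
// count is moved into an XMM register and a single vpsll/vpsra/vpsrl variant
// shifts every lane by it. The one exception, the 64-bit arithmetic right
// shift, is handled separately below.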
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpsllw(scratch, in, dest);
}
void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpsraw(scratch, in, dest);
}
void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpsrlw(scratch, in, dest);
}
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpslld(scratch, in, dest);
}
void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpsrad(scratch, in, dest);
}
void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpsrld(scratch, in, dest);
}
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpsllq(scratch, in, dest);
}
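// There is no 64-bit arithmetic right shift before AVX-512 (vpsraq), so it is
// emulated: XOR with the replicated sign, do a logical shift, XOR again.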
void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, temp);
asMasm().signReplicationInt64x2(in, scratch);
in = asMasm().moveSimd128FloatIfNotAVX(in, dest);
// Invert if negative, shift all, invert back if negative.
vpxor(Operand(scratch), in, dest);
vpsrlq(temp, dest, dest);
vpxor(Operand(scratch), dest, dest);
}
void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
FloatRegister in, Register count, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, scratch);
vpsrlq(scratch, in, dest);
}
void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
Imm32 count, FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().signReplicationInt64x2(src, scratch);
// Invert if negative, shift all, invert back if negative.
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vpxor(Operand(scratch), src, dest);
vpsrlq(Imm32(count.value & 63), dest, dest);
vpxor(Operand(scratch), dest, dest);
}
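// Bitwise select: output = (mask & onTrue) | (~mask & onFalse).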
void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
FloatRegister onTrue,
FloatRegister onFalse,
FloatRegister temp,
FloatRegister output) {
// Normally the codegen will attempt to enforce these register assignments so
// that the moves are avoided.
onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output);
if (MOZ_UNLIKELY(mask == onTrue)) {
vpor(Operand(onFalse), onTrue, output);
return;
}
mask = asMasm().moveSimd128IntIfNotAVX(mask, temp);
vpand(Operand(mask), onTrue, output);
vpandn(Operand(onFalse), mask, temp);
vpor(Operand(temp), output, output);
}
// Code sequences for int32x4<->float32x4 culled from v8; commentary added.
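// unsignedConvertInt32x4ToFloat32x4 below splits each unsigned value into its
// low 16 bits, which convert exactly, and the high part, which is halved into
// the signed range, converted, and doubled; only the final addition of the
// two halves may round.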
void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
src = asMasm().moveSimd128IntIfNotAVX(src, dest);
vpxor(Operand(scratch), scratch, scratch); // extract low bits
vpblendw(0x55, src, scratch, scratch); // into scratch
vpsubd(Operand(scratch), src, dest); // and high bits into dest
vcvtdq2ps(scratch, scratch); // convert low bits
vpsrld(Imm32(1), dest, dest); // get high into unsigned range
vcvtdq2ps(dest, dest); // convert
vaddps(Operand(dest), dest, dest); // and back into signed
vaddps(Operand(scratch), dest, dest); // combine high+low: may round
}
void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive
// values to 7FFFFFFFh and too-large negative values to 80000000h. NaN and -0
// become 0.
// Convert NaN to 0 by masking away values that compare unordered to themselves.
if (HasAVX()) {
vcmpeqps(Operand(src), src, scratch);
vpand(Operand(scratch), src, dest);
} else {
vmovaps(src, scratch);
vcmpeqps(Operand(scratch), scratch, scratch);
moveSimd128Float(src, dest);
vpand(Operand(scratch), dest, dest);
}
// Set lanes in scratch to all ones (FFFFFFFFh) where dest will overflow
// during cvttps2dq, and to 0 otherwise.
static const SimdConstant minOverflowedInt =
SimdConstant::SplatX4(2147483648.f);
if (HasAVX()) {
asMasm().vcmpgepsSimd128(minOverflowedInt, dest, scratch);
} else {
asMasm().loadConstantSimd128Float(minOverflowedInt, scratch);
vcmpleps(Operand(dest), scratch, scratch);
}
// Convert. This will make the output 80000000h if the input is out of range.
vcvttps2dq(dest, dest);
// Convert overflow lanes to 0x7FFFFFFF.
vpxor(Operand(scratch), dest, dest);
}
void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
FloatRegister src, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive
// values to FFFFFFFFh and negative values to zero. NaN and -0 become 0.
// Convert NaN and negative values to zeroes in dest.
vxorps(Operand(scratch), scratch, scratch);
vmaxps(Operand(scratch), src, dest);
// Place the largest positive signed integer in all lanes in scratch.
// We use it to bias the conversion to handle edge cases.
asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(2147483647.f),
scratch);
// temp = dest - 7FFFFFFFh (as floating), this brings integers in the unsigned
// range but above the signed range into the signed range; 0 => -7FFFFFFFh.
vmovaps(dest, temp);
vsubps(Operand(scratch), temp, temp);
// scratch = mask of biased values that are greater than 7FFFFFFFh.
vcmpleps(Operand(temp), scratch, scratch);
// Convert the biased values to integer. Positive values above 7FFFFFFFh will
// have been converted to 80000000h, all others become the expected integer.
vcvttps2dq(temp, temp);
// As lanes of scratch are ~0 where the result overflows, this computes
// 7FFFFFFFh in lanes of temp that are 80000000h, and leaves other lanes
// untouched as the biased integer.
vpxor(Operand(scratch), temp, temp);
// Convert negative biased lanes in temp to zero. After this, temp will be
// zero where the result should be zero or is less than 80000000h, 7FFFFFFFh
// where the result overflows, and will have the converted biased result in
// other lanes (for input values >= 80000000h).
vpxor(Operand(scratch), scratch, scratch);
vpmaxsd(Operand(scratch), temp, temp);
// Convert. Overflow lanes above 7FFFFFFFh will be 80000000h, other lanes will
// be what they should be.
vcvttps2dq(dest, dest);
// Add temp to the result. Overflow lanes with 80000000h become FFFFFFFFh,
// biased high-value unsigned lanes become unbiased, and everything else is
// left unchanged.
vpaddd(Operand(temp), dest, dest);
}
void MacroAssemblerX86Shared::unsignedTruncFloat32x4ToInt32x4Relaxed(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
// Place lanes below 80000000h into dest, otherwise into scratch.
// Keep dest or scratch 0 as default.
asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch);
vcmpltps(Operand(src), scratch, scratch);
vpand(Operand(src), scratch, scratch);
vpxor(Operand(scratch), src, dest);
// Convert lanes below 80000000h into unsigned int without issues.
vcvttps2dq(dest, dest);
// Exploiting the IEEE-754 representation: convert lanes above 7FFFFFFFh by
// multiplying by 2 (adding 1 to the exponent) and shifting left by 8 bits.
vaddps(Operand(scratch), scratch, scratch);
vpslld(Imm32(8), scratch, scratch);
// Combine the results.
vpaddd(Operand(scratch), dest, dest);
}
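// Double-precision magic-number trick: pairing each unsigned 32-bit lane with
// the high word 43300000h forms the double 2^52 + x, so subtracting 2^52
// leaves the exact converted value.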
void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat64x2(
FloatRegister src, FloatRegister dest) {
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
asMasm().vunpcklpsSimd128(SimdConstant::SplatX4(0x43300000), src, dest);
asMasm().vsubpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest, dest);
}
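// Saturating f64x2 -> i32x4: the self-equality compare zeroes the clamp
// constant in NaN lanes, so vminpd maps NaNs to 0.0 and clamps the rest to
// INT32_MAX (exactly representable as a double); vcvttpd2dq then converts,
// turning too-negative values into 80000000h on its own.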
void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src,
FloatRegister temp,
FloatRegister dest) {
FloatRegister srcForTemp = asMasm().moveSimd128FloatIfNotAVX(src, temp);
vcmpeqpd(Operand(srcForTemp), srcForTemp, temp);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
asMasm().vandpdSimd128(SimdConstant::SplatX2(2147483647.0), temp, temp);
vminpd(Operand(temp), src, dest);
vcvttpd2dq(dest, dest);
}
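// Unsigned saturating f64x2 -> i32x4: clamp to [0, 4294967295], truncate,
// then add 2^52 so each integer result lands in the low 32 bits of its
// double; vshufps gathers those low words, with zeros from temp on top.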
void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
FloatRegister src, FloatRegister temp, FloatRegister dest) {
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vxorpd(temp, temp, temp);
vmaxpd(Operand(temp), src, dest);
asMasm().vminpdSimd128(SimdConstant::SplatX2(4294967295.0), dest, dest);
vroundpd(SSERoundingMode::Trunc, Operand(dest), dest);
asMasm().vaddpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest, dest);
// temp == 0
vshufps(0x88, temp, dest, dest);
}
void MacroAssemblerX86Shared::unsignedTruncFloat64x2ToInt32x4Relaxed(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
// The same as unsignedTruncSatFloat64x2ToInt32x4, but without the NaN
// and out-of-bounds checks.
vroundpd(SSERoundingMode::Trunc, Operand(src), dest);
asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0),
scratch);
vaddpd(Operand(scratch), dest, dest);
// The scratch has zeros in f32x4 lanes with index 0 and 2. The in-memory
// representation of the splatted double constant contains zero in its
// low bits.
vshufps(0x88, scratch, dest, dest);
}
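// Per-byte population count via two nibble lookups: split each byte into its
// low and high nibble, look each up in a 16-entry table of nibble popcounts
// with vpshufb, and add the two results.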
void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src,
FloatRegister temp,
FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch);
FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp);
vpand(scratch, srcForTemp, temp);
vpandn(src, scratch, scratch);
int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output);
vpsrlw(Imm32(4), scratch, scratch);
vpshufb(temp, output, output);
asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), temp);
vpshufb(scratch, temp, temp);
vpaddb(Operand(temp), output, output);
}