texture.h - mozsearch

mozilla-central/gfx/wr/swgl/src/texture.h (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Graphics: WebRender

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

namespace glsl {

using PackedRGBA8 = V16<uint8_t>;

using WideRGBA8 = V16<uint16_t>;

using HalfRGBA8 = V8<uint16_t>;

SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); }

template <int N>

UNUSED SI VectorType<uint8_t, N> genericPackWide(VectorType<uint16_t, N> p) {

  typedef VectorType<uint8_t, N> packed_type;

  // Generic conversions only mask off the low byte without actually clamping

  // like a real pack. First force the word to all 1s if it overflows, and then

  // add on the sign bit to cause it to roll over to 0 if it was negative.

  p = (p | (p > 255)) + (p >> 15);

  return CONVERT(p, packed_type);

SI PackedRGBA8 pack(WideRGBA8 p) {

#if USE_SSE2

  return _mm_packus_epi16(lowHalf(p), highHalf(p));

#elif USE_NEON

  return vcombine_u8(vqmovun_s16(bit_cast<V8<int16_t>>(lowHalf(p))),

                     vqmovun_s16(bit_cast<V8<int16_t>>(highHalf(p))));

#else

  return genericPackWide(p);

#endif

using PackedR8 = V4<uint8_t>;

using WideR8 = V4<uint16_t>;

SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); }

SI PackedR8 pack(WideR8 p) {

#if USE_SSE2

  auto m = expand(p);

  auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m));

  return SHUFFLE(r, r, 0, 1, 2, 3);

#elif USE_NEON

  return lowHalf(

      bit_cast<V8<uint8_t>>(vqmovun_s16(bit_cast<V8<int16_t>>(expand(p)))));

#else

  return genericPackWide(p);

#endif

using PackedRG8 = V8<uint8_t>;

using WideRG8 = V8<uint16_t>;

SI PackedRG8 pack(WideRG8 p) {

#if USE_SSE2

  return lowHalf(bit_cast<V16<uint8_t>>(_mm_packus_epi16(p, p)));

#elif USE_NEON

  return bit_cast<V8<uint8_t>>(vqmovun_s16(bit_cast<V8<int16_t>>(p)));

#else

  return genericPackWide(p);

#endif

SI I32 clampCoord(I32 coord, int limit, int base = 0) {

#if USE_SSE2

  return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)),

                       _mm_set1_epi32(limit - 1));

#else

  return clamp(coord, base, limit - 1);

#endif

SI int clampCoord(int coord, int limit, int base = 0) {

  return min(max(coord, base), limit - 1);

template <typename T, typename S>

SI T clamp2D(T P, S sampler) {

  return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)};

SI float to_float(uint32_t x) { return x * (1.f / 255.f); }

SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {

  U32 pixels = {a, b, c, d};

  return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF),

              cast(pixels & 0xFF), cast(pixels >> 24)) *

         (1.0f / 255.0f);

SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) {

  return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y},

              Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w});

SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) {

  return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y},

               I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w});

SI vec4_scalar pixel_to_vec4(uint32_t p) {

  U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24};

  Float f = cast(i) * (1.0f / 255.0f);

  return vec4_scalar(f.x, f.y, f.z, f.w);

template <typename S>

SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) {

  return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y],

                       sampler->buf[offset.z], sampler->buf[offset.w]);

template <typename S>

vec4 texelFetchRGBA8(S sampler, ivec2 P) {

  I32 offset = P.x + P.y * sampler->stride;

  return fetchOffsetsRGBA8(sampler, offset);

template <typename S>

SI Float fetchOffsetsR8(S sampler, I32 offset) {

  U32 i = {

      ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y],

      ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]};

  return cast(i) * (1.0f / 255.0f);

template <typename S>

vec4 texelFetchR8(S sampler, ivec2 P) {

  I32 offset = P.x + P.y * sampler->stride;

  return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);

template <typename S>

SI vec4 fetchOffsetsRG8(S sampler, I32 offset) {

  uint16_t* buf = (uint16_t*)sampler->buf;

  U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]};

  Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f);

  Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f);

  return vec4(r, g, 0.0f, 1.0f);

template <typename S>

vec4 texelFetchRG8(S sampler, ivec2 P) {

  I32 offset = P.x + P.y * sampler->stride;

  return fetchOffsetsRG8(sampler, offset);

template <typename S>

SI Float fetchOffsetsR16(S sampler, I32 offset) {

  U32 i = {

      ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y],

      ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]};

  return cast(i) * (1.0f / 65535.0f);

template <typename S>

vec4 texelFetchR16(S sampler, ivec2 P) {

  I32 offset = P.x + P.y * sampler->stride;

  return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f);

template <typename S>

SI vec4 fetchOffsetsRG16(S sampler, I32 offset) {

  U32 pixels = {sampler->buf[offset.x], sampler->buf[offset.y],

                sampler->buf[offset.z], sampler->buf[offset.w]};

  Float r = cast(pixels & 0xFFFF) * (1.0f / 65535.0f);

  Float g = cast(pixels >> 16) * (1.0f / 65535.0f);

  return vec4(r, g, 0.0f, 1.0f);

template <typename S>

vec4 texelFetchRG16(S sampler, ivec2 P) {

  I32 offset = P.x + P.y * sampler->stride;

  return fetchOffsetsRG16(sampler, offset);

SI vec4 fetchOffsetsFloat(const uint32_t* buf, I32 offset) {

  return pixel_float_to_vec4(*(Float*)&buf[offset.x], *(Float*)&buf[offset.y],

                             *(Float*)&buf[offset.z], *(Float*)&buf[offset.w]);

SI vec4 fetchOffsetsFloat(samplerCommon* sampler, I32 offset) {

  return fetchOffsetsFloat(sampler->buf, offset);

vec4 texelFetchFloat(sampler2D sampler, ivec2 P) {

  I32 offset = P.x * 4 + P.y * sampler->stride;

  return fetchOffsetsFloat(sampler, offset);

template <typename S>

SI vec4 fetchOffsetsYUV422(S sampler, I32 offset) {

  // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.

  // Offset is aligned to a chunk rather than a pixel, and selector specifies

  // pixel within the chunk.

  I32 selector = offset & 1;

  offset &= ~1;

  uint16_t* buf = (uint16_t*)sampler->buf;

  U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y],

                *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]};

  Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f);

  Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f);

  Float g =

      CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) *

      (1.0f / 255.0f);

  return vec4(r, g, b, 1.0f);

template <typename S>

vec4 texelFetchYUV422(S sampler, ivec2 P) {

  I32 offset = P.x + P.y * sampler->stride;

  return fetchOffsetsYUV422(sampler, offset);

vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  switch (sampler->format) {

    case TextureFormat::RGBA32F:

      return texelFetchFloat(sampler, P);

    case TextureFormat::RGBA8:

      return texelFetchRGBA8(sampler, P);

    case TextureFormat::R8:

      return texelFetchR8(sampler, P);

    case TextureFormat::RG8:

      return texelFetchRG8(sampler, P);

    case TextureFormat::R16:

      return texelFetchR16(sampler, P);

    case TextureFormat::RG16:

      return texelFetchRG16(sampler, P);

    case TextureFormat::YUV422:

      return texelFetchYUV422(sampler, P);

    default:

      assert(false);

      return vec4();

vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RGBA32F);

  return texelFetchFloat(sampler, P);

vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RGBA8);

  return texelFetchRGBA8(sampler, P);

vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::R8);

  return texelFetchR8(sampler, P);

vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RG8);

  return texelFetchRG8(sampler, P);

vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  if (sampler->format == TextureFormat::RGBA32F) {

    return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];

  } else {

    assert(sampler->format == TextureFormat::RGBA8);

    return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);

vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RGBA32F);

  return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];

vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RGBA8);

  return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);

vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::R8);

  return vec4_scalar{

      to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f,

      0.0f, 1.0f};

vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RG8);

  uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride];

  return vec4_scalar{to_float(pixel & 0xFF), to_float(pixel >> 8), 0.0f, 1.0f};

vec4 texelFetch(sampler2DRect sampler, ivec2 P) {

  P = clamp2D(P, sampler);

  switch (sampler->format) {

    case TextureFormat::RGBA8:

      return texelFetchRGBA8(sampler, P);

    case TextureFormat::R8:

      return texelFetchR8(sampler, P);

    case TextureFormat::RG8:

      return texelFetchRG8(sampler, P);

    case TextureFormat::R16:

      return texelFetchR16(sampler, P);

    case TextureFormat::RG16:

      return texelFetchRG16(sampler, P);

    case TextureFormat::YUV422:

      return texelFetchYUV422(sampler, P);

    default:

      assert(false);

      return vec4();

SI ivec4 fetchOffsetsInt(const uint32_t* buf, I32 offset) {

  return pixel_int_to_ivec4(*(I32*)&buf[offset.x], *(I32*)&buf[offset.y],

                            *(I32*)&buf[offset.z], *(I32*)&buf[offset.w]);

SI ivec4 fetchOffsetsInt(samplerCommon* sampler, I32 offset) {

  return fetchOffsetsInt(sampler->buf, offset);

ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RGBA32I);

  I32 offset = P.x * 4 + P.y * sampler->stride;

  return fetchOffsetsInt(sampler, offset);

ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) {

  assert(lod == 0);

  P = clamp2D(P, sampler);

  assert(sampler->format == TextureFormat::RGBA32I);

  return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];

constexpr int MAX_TEXEL_OFFSET = 8;

// Fill texelFetchOffset outside the valid texture bounds with zeroes. The

// stride will be set to 0 so that only one row of zeroes is needed.

static const uint32_t

    zeroFetchBuf[MAX_TEXEL_OFFSET * sizeof(Float) / sizeof(uint32_t)] = {0};

struct FetchScalar {

  const uint32_t* buf;

  uint32_t stride;

};

template <typename S>

SI FetchScalar texelFetchPtr(S sampler, ivec2_scalar P, int min_x, int max_x,

                             int min_y, int max_y) {

  assert(max_x < MAX_TEXEL_OFFSET);

  if (P.x < -min_x || P.x >= int(sampler->width) - max_x || P.y < -min_y ||

      P.y >= int(sampler->height) - max_y) {

    return FetchScalar{zeroFetchBuf, 0};

  return FetchScalar{&sampler->buf[P.x * 4 + P.y * sampler->stride],

                     sampler->stride};

SI vec4_scalar texelFetchUnchecked(sampler2D sampler, FetchScalar ptr, int x,

                                   int y = 0) {

  assert(sampler->format == TextureFormat::RGBA32F);

  return *(vec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride];

SI ivec4_scalar texelFetchUnchecked(isampler2D sampler, FetchScalar ptr, int x,

                                    int y = 0) {

  assert(sampler->format == TextureFormat::RGBA32I);

  return *(ivec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride];

struct FetchVector {

  const uint32_t* buf;

  I32 offset;

  uint32_t stride;

};

template <typename S>

SI FetchVector texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x,

                             int min_y, int max_y) {

  assert(max_x < MAX_TEXEL_OFFSET);

  if (test_any(P.x < -min_x || P.x >= int(sampler->width) - max_x ||

               P.y < -min_y || P.y >= int(sampler->height) - max_y)) {

    return FetchVector{zeroFetchBuf, I32(0), 0};

  return FetchVector{sampler->buf, P.x * 4 + P.y * sampler->stride,

                     sampler->stride};

SI vec4 texelFetchUnchecked(sampler2D sampler, FetchVector ptr, int x,

                            int y = 0) {

  assert(sampler->format == TextureFormat::RGBA32F);

  return fetchOffsetsFloat(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset);

SI ivec4 texelFetchUnchecked(isampler2D sampler, FetchVector ptr, int x,

                             int y = 0) {

  assert(sampler->format == TextureFormat::RGBA32I);

  return fetchOffsetsInt(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset);

#define texelFetchOffset(sampler, P, lod, offset) \

  texelFetch(sampler, (P) + (offset), lod)

// Scale texture coords for quantization, subtract offset for filtering

// (assuming coords already offset to texel centers), and round to nearest

// 1/scale increment

template <typename T>

SI T linearQuantize(T P, float scale) {

  return P * scale + (0.5f - 0.5f * scale);

// Helper version that also scales normalized texture coords for sampler

template <typename T, typename S>

SI T samplerScale(S sampler, T P) {

  P.x *= sampler->width;

  P.y *= sampler->height;

  return P;

template <typename T>

SI T samplerScale(UNUSED sampler2DRect sampler, T P) {

  return P;

template <typename T, typename S>

SI T linearQuantize(T P, float scale, S sampler) {

  return linearQuantize(samplerScale(sampler, P), scale);

// Compute clamped offset of first row for linear interpolation

template <typename S, typename I>

SI auto computeRow(S sampler, I i, size_t margin = 1) -> decltype(i.x) {

  return clampCoord(i.x, sampler->width - margin) +

         clampCoord(i.y, sampler->height) * sampler->stride;

// Compute clamped offset of second row for linear interpolation from first row

template <typename S, typename I>

SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) {

  return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1,

                      sampler->stride, 0);

// Convert X coordinate to a 2^7 scale fraction for interpolation

template <typename S>

SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) {

  auto overread = i.x > int32_t(sampler->width) - 2;

  return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16);

// Convert Y coordinate to a 2^7 scale fraction for interpolation

SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); }

SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); }

struct WidePlanarRGBA8 {

  V8<uint16_t> rg;

  V8<uint16_t> ba;

};

template <typename S>

SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::RGBA8);

  ivec2 frac = i;

  i >>= 7;

  I32 row0 = computeRow(sampler, i);

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  I16 fracx = computeFracX(sampler, i, frac);

  I16 fracy = computeFracY(frac);

  auto a0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);

  auto a1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);

  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);

  auto b1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);

  b0 += ((b1 - b0) * fracy.y) >> 7;

  auto abl = zipLow(a0, b0);

  auto abh = zipHigh(a0, b0);

  abl += ((abh - abl) * fracx.xyxyxyxy) >> 7;

  auto c0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);

  auto c1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);

  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);

  auto d1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);

  d0 += ((d1 - d0) * fracy.w) >> 7;

  auto cdl = zipLow(c0, d0);

  auto cdh = zipHigh(c0, d0);

  cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7;

  auto rg = V8<uint16_t>(zip2Low(abl, cdl));

  auto ba = V8<uint16_t>(zip2High(abl, cdl));

  return WidePlanarRGBA8{rg, ba};

template <typename S>

vec4 textureLinearRGBA8(S sampler, vec2 P) {

  ivec2 i(linearQuantize(P, 128, sampler));

  auto planar = textureLinearPlanarRGBA8(sampler, i);

  auto rg = CONVERT(planar.rg, V8<float>);

  auto ba = CONVERT(planar.ba, V8<float>);

  auto r = lowHalf(rg);

  auto g = highHalf(rg);

  auto b = lowHalf(ba);

  auto a = highHalf(ba);

  return vec4(b, g, r, a) * (1.0f / 255.0f);

template <typename S>

static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::R8);

  ivec2 frac = i;

  i >>= 7;

  I32 row0 = computeRow(sampler, i);

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  I16 fracx = computeFracX(sampler, i, frac);

  I16 fracy = computeFracY(frac);

  uint8_t* buf = (uint8_t*)sampler->buf;

  auto a0 = unaligned_load<V2<uint8_t>>(&buf[row0.x]);

  auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]);

  auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]);

  auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]);

  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);

  auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]);

  auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]);

  auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]);

  auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]);

  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);

  abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7;

  abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);

  auto abcdl = lowHalf(abcd0);

  auto abcdh = highHalf(abcd0);

  abcdl += ((abcdh - abcdl) * fracx) >> 7;

  return U16(abcdl);

template <typename S>

vec4 textureLinearR8(S sampler, vec2 P) {

  assert(sampler->format == TextureFormat::R8);

  ivec2 i(linearQuantize(P, 128, sampler));

  Float r = CONVERT(textureLinearUnpackedR8(sampler, i), Float);

  return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f);

struct WidePlanarRG8 {

  V8<uint16_t> rg;

};

template <typename S>

SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::RG8);

  ivec2 frac = i;

  i >>= 7;

  I32 row0 = computeRow(sampler, i);

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  I16 fracx = computeFracX(sampler, i, frac);

  I16 fracy = computeFracY(frac);

  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load RG bytes for two adjacent pixels - rgRG

  auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);

  auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);

  auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);

  // Load two pixels for next row

  auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);

  auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);

  auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);

  // Blend rows

  ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;

  auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);

  auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);

  auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);

  auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);

  auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);

  auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);

  // Blend rows

  cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;

  // ab = a.rgRG,b.rgRG

  // cd = c.rgRG,d.rgRG

  // ... ac = ar,cr,ag,cg,aR,cR,aG,cG

  // ... bd = br,dr,bg,dg,bR,dR,bG,dG

  auto ac = zipLow(ab0, cd0);

  auto bd = zipHigh(ab0, cd0);

  // ar,br,cr,dr,ag,bg,cg,dg

  // aR,bR,cR,dR,aG,bG,cG,dG

  auto abcdl = zipLow(ac, bd);

  auto abcdh = zipHigh(ac, bd);

  // Blend columns

  abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7;

  auto rg = V8<uint16_t>(abcdl);

  return WidePlanarRG8{rg};

template <typename S>

vec4 textureLinearRG8(S sampler, vec2 P) {

  ivec2 i(linearQuantize(P, 128, sampler));

  auto planar = textureLinearPlanarRG8(sampler, i);

  auto rg = CONVERT(planar.rg, V8<float>) * (1.0f / 255.0f);

  auto r = lowHalf(rg);

  auto g = highHalf(rg);

  return vec4(r, g, 0.0f, 1.0f);

// Samples R16 texture with linear filtering and returns results packed as

// signed I16. One bit of precision is shifted away from the bottom end to

// accommodate the sign bit, so only 15 bits of precision is left.

template <typename S>

static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::R16);

  ivec2 frac = i;

  i >>= 7;

  I32 row0 = computeRow(sampler, i);

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  I16 fracx =

      CONVERT(

          ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F,

          I16)

      << 8;

  I16 fracy = computeFracY(frac) << 8;

  // Sample the 16 bit data for both rows

  uint16_t* buf = (uint16_t*)sampler->buf;

  auto a0 = unaligned_load<V2<uint16_t>>(&buf[row0.x]);

  auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]);

  auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]);

  auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]);

  auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8<int16_t>);

  auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]);

  auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]);

  auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]);

  auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]);

  auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8<int16_t>);

  // The samples occupy 15 bits and the fraction occupies 15 bits, so that when

  // they are multiplied together, the new scaled sample will fit in the high

  // 14 bits of the result. It is left shifted once to make it 15 bits again

  // for the final multiply.

#if USE_SSE2

  abcd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww))

           << 1;

#elif USE_NEON

  // NEON has a convenient instruction that does both the multiply and the

  // doubling, so doesn't need an extra shift.

  abcd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww));

#else

  abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8<int32_t>) *

                    CONVERT(fracy.xxyyzzww, V8<int32_t>)) >>

16,

                   V8<int16_t>)

           << 1;

#endif

  abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);

  auto abcdl = lowHalf(abcd0);

  auto abcdh = highHalf(abcd0);

#if USE_SSE2

  abcdl += lowHalf(bit_cast<V8<int16_t>>(

               _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx))))

           << 1;

#elif USE_NEON

  abcdl += bit_cast<V4<int16_t>>(vqrdmulh_s16(abcdh - abcdl, fracx));

#else

  abcdl += CONVERT((CONVERT(abcdh - abcdl, V4<int32_t>) *

                    CONVERT(fracx, V4<int32_t>)) >>

16,

                   V4<int16_t>)

           << 1;

#endif

  return abcdl;

template <typename S>

vec4 textureLinearR16(S sampler, vec2 P) {

  assert(sampler->format == TextureFormat::R16);

  ivec2 i(linearQuantize(P, 128, sampler));

  Float r = CONVERT(textureLinearUnpackedR16(sampler, i), Float);

  return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f);

// Samples RG16 texture with linear filtering and returns results packed as

// signed I16. One bit of precision is shifted away from the bottom end to

// accommodate the sign bit, so only 15 bits of precision is left.

template <typename S>

static inline V8<int16_t> textureLinearUnpackedRG16(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::RG16);

  ivec2 frac = i;

  i >>= 7;

  I32 row0 = computeRow(sampler, i);

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  I16 fracx =

      CONVERT(

          ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F,

          I16)

      << 8;

  I16 fracy = computeFracY(frac) << 8;

  // Sample the 2x16 bit data for both rows

  auto a0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.x]);

  auto b0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.y]);

  auto ab0 = CONVERT(combine(a0, b0) >> 1, V8<int16_t>);

  auto c0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.z]);

  auto d0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.w]);

  auto cd0 = CONVERT(combine(c0, d0) >> 1, V8<int16_t>);

  auto a1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.x]);

  auto b1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.y]);

  auto ab1 = CONVERT(combine(a1, b1) >> 1, V8<int16_t>);

  auto c1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.z]);

  auto d1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.w]);

  auto cd1 = CONVERT(combine(c1, d1) >> 1, V8<int16_t>);

  // The samples occupy 15 bits and the fraction occupies 15 bits, so that when

  // they are multiplied together, the new scaled sample will fit in the high

  // 14 bits of the result. It is left shifted once to make it 15 bits again

  // for the final multiply.

#if USE_SSE2

  ab0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(ab1 - ab0, fracy.xxxxyyyy)) << 1;

  cd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(cd1 - cd0, fracy.zzzzwwww)) << 1;

#elif USE_NEON

  // NEON has a convenient instruction that does both the multiply and the

  // doubling, so doesn't need an extra shift.

  ab0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(ab1 - ab0, fracy.xxxxyyyy));

  cd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(cd1 - cd0, fracy.zzzzwwww));

#else

  ab0 += CONVERT((CONVERT(ab1 - ab0, V8<int32_t>) *

                  CONVERT(fracy.xxxxyyyy, V8<int32_t>)) >>

16,

                 V8<int16_t>)

         << 1;

  cd0 += CONVERT((CONVERT(cd1 - cd0, V8<int32_t>) *

                  CONVERT(fracy.zzzzwwww, V8<int32_t>)) >>

16,

                 V8<int16_t>)

         << 1;

#endif

  // ab = a.rgRG,b.rgRG

  // cd = c.rgRG,d.rgRG

  // ... ac = a.rg,c.rg,a.RG,c.RG

  // ... bd = b.rg,d.rg,b.RG,d.RG

  auto ac = zip2Low(ab0, cd0);

  auto bd = zip2High(ab0, cd0);

  // a.rg,b.rg,c.rg,d.rg

  // a.RG,b.RG,c.RG,d.RG

  auto abcdl = zip2Low(ac, bd);

  auto abcdh = zip2High(ac, bd);

  // Blend columns

#if USE_SSE2

  abcdl += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcdh - abcdl, fracx.xxyyzzww))

           << 1;

#elif USE_NEON

  abcdl += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcdh - abcdl, fracx.xxyyzzww));

#else

  abcdl += CONVERT((CONVERT(abcdh - abcdl, V8<int32_t>) *

                    CONVERT(fracx.xxyyzzww, V8<int32_t>)) >>

16,

                   V8<int16_t>)

           << 1;

#endif

  return abcdl;

template <typename S>

vec4 textureLinearRG16(S sampler, vec2 P) {

  assert(sampler->format == TextureFormat::RG16);

  ivec2 i(linearQuantize(P, 128, sampler));

  auto rg = bit_cast<V4<int32_t>>(textureLinearUnpackedRG16(sampler, i));

  auto r = cast(rg & 0xFFFF) * (1.0f / 32767.0f);

  auto g = cast(rg >> 16) * (1.0f / 32767.0f);

  return vec4(r, g, 0.0f, 1.0f);

using PackedRGBA32F = V16<float>;

using WideRGBA32F = V16<float>;

template <typename S>

vec4 textureLinearRGBA32F(S sampler, vec2 P) {

  assert(sampler->format == TextureFormat::RGBA32F);

  P = samplerScale(sampler, P);

  P -= 0.5f;

  vec2 f = floor(P);

  vec2 r = P - f;

  ivec2 i(f);

  ivec2 c(clampCoord(i.x, sampler->width - 1),

          clampCoord(i.y, sampler->height));

  r.x = if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0),

                     0.0f);

  I32 offset0 = c.x * 4 + c.y * sampler->stride;

  I32 offset1 = offset0 + computeNextRowOffset(sampler, i);

  Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x],

                     *(Float*)&sampler->buf[offset0.x + 4], r.x),

                 mix(*(Float*)&sampler->buf[offset1.x],

                     *(Float*)&sampler->buf[offset1.x + 4], r.x),

                 r.y);

  Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y],

                     *(Float*)&sampler->buf[offset0.y + 4], r.x),

                 mix(*(Float*)&sampler->buf[offset1.y],

                     *(Float*)&sampler->buf[offset1.y + 4], r.x),

                 r.y);

  Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z],

                     *(Float*)&sampler->buf[offset0.z + 4], r.x),

                 mix(*(Float*)&sampler->buf[offset1.z],

                     *(Float*)&sampler->buf[offset1.z + 4], r.x),

                 r.y);

  Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w],

                     *(Float*)&sampler->buf[offset0.w + 4], r.x),

                 mix(*(Float*)&sampler->buf[offset1.w],

                     *(Float*)&sampler->buf[offset1.w + 4], r.x),

                 r.y);

  return pixel_float_to_vec4(c0, c1, c2, c3);

struct WidePlanarYUV8 {

  U16 y;

  U16 u;

  U16 v;

};

template <typename S>

SI WidePlanarYUV8 textureLinearPlanarYUV422(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::YUV422);

  ivec2 frac = i;

  i >>= 7;

  I32 row0 = computeRow(sampler, i, 2);

  // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.

  // Get the selector for the pixel within the chunk.

  I32 selector = row0 & 1;

  // Align the row index to the chunk.

  row0 &= ~1;

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  // G only needs to be clamped to a pixel boundary for safe interpolation,

  // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk

  // boundary.

  frac.x &= (i.x >= 0);

  auto fracx =

      CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3),

                      (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) &

                  0x7F,

              V8<int16_t>);

  I16 fracy = computeFracY(frac);

  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R

  // We always need to interpolate between (b,r) and (B,R).

  // Depending on selector we need to either interpolate between g0 and g1

  // or between g1 and G0. So for now we just interpolate both cases for g

  // and will select the appropriate one on output.

  auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.x]), V8<int16_t>);

  auto a1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.x]), V8<int16_t>);

  // Combine with next row.

  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.y]), V8<int16_t>);

  auto b1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.y]), V8<int16_t>);

  b0 += ((b1 - b0) * fracy.y) >> 7;

  auto c0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.z]), V8<int16_t>);

  auto c1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.z]), V8<int16_t>);

  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.w]), V8<int16_t>);

  auto d1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.w]), V8<int16_t>);

  d0 += ((d1 - d0) * fracy.w) >> 7;

  // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and

  // g1,g1,g1,g1,r,r,r,r.

  auto abl = zipLow(a0, b0);

  auto cdl = zipLow(c0, d0);

  auto g0b = zip2Low(abl, cdl);

  auto g1r = zip2High(abl, cdl);

  // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and

  // and shifts, just shuffle here instead... We finally end up with

  // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R.

  auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15);

  auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15);

  auto g1B = zip2Low(abh, cdh);

  auto G0R = zip2High(abh, cdh);

  // Finally interpolate between adjacent columns.

  g0b += ((g1B - g0b) * fracx) >> 7;

  g1r += ((G0R - g1r) * fracx) >> 7;

  // Choose either g0 or g1 based on selector.

  return WidePlanarYUV8{

      U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))),

      U16(highHalf(g0b)), U16(highHalf(g1r))};

template <typename S>

vec4 textureLinearYUV422(S sampler, vec2 P) {

  ivec2 i(linearQuantize(P, 128, sampler));

  auto planar = textureLinearPlanarYUV422(sampler, i);

  auto y = CONVERT(planar.y, Float) * (1.0f / 255.0f);

  auto u = CONVERT(planar.u, Float) * (1.0f / 255.0f);

  auto v = CONVERT(planar.v, Float) * (1.0f / 255.0f);

  return vec4(v, y, u, 1.0f);

SI vec4 texture(sampler2D sampler, vec2 P) {

  if (sampler->filter == TextureFilter::LINEAR) {

    switch (sampler->format) {

      case TextureFormat::RGBA32F:

        return textureLinearRGBA32F(sampler, P);

      case TextureFormat::RGBA8:

        return textureLinearRGBA8(sampler, P);

      case TextureFormat::R8:

        return textureLinearR8(sampler, P);

      case TextureFormat::RG8:

        return textureLinearRG8(sampler, P);

      case TextureFormat::R16:

        return textureLinearR16(sampler, P);

      case TextureFormat::RG16:

        return textureLinearRG16(sampler, P);

      case TextureFormat::YUV422:

        return textureLinearYUV422(sampler, P);

      default:

        assert(false);

        return vec4();

  } else {

    ivec2 coord(roundzero(P.x, sampler->width),

                roundzero(P.y, sampler->height));

    return texelFetch(sampler, coord, 0);

vec4 texture(sampler2DRect sampler, vec2 P) {

  if (sampler->filter == TextureFilter::LINEAR) {

    switch (sampler->format) {

      case TextureFormat::RGBA8:

        return textureLinearRGBA8(sampler, P);

      case TextureFormat::R8:

        return textureLinearR8(sampler, P);

      case TextureFormat::RG8:

        return textureLinearRG8(sampler, P);

      case TextureFormat::R16:

        return textureLinearR16(sampler, P);

      case TextureFormat::RG16:

        return textureLinearRG16(sampler, P);

      case TextureFormat::YUV422:

        return textureLinearYUV422(sampler, P);

      default:

        assert(false);

        return vec4();

  } else {

    ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f));

    return texelFetch(sampler, coord);

template <typename S>

vec4_scalar texture(S sampler, vec2_scalar P) {

  return force_scalar(texture(sampler, vec2(P)));

ivec2_scalar textureSize(sampler2D sampler, int) {

  return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};

ivec2_scalar textureSize(sampler2DRect sampler) {

  return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};

template <typename S>

static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::RGBA8);

  ivec2 frac = i;

  i >>= 7;

  I32 row0 = computeRow(sampler, i);

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  I16 fracx = computeFracX(sampler, i, frac);

  I16 fracy = computeFracY(frac);

  auto a0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);

  auto a1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);

  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);

  auto b1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);

  b0 += ((b1 - b0) * fracy.y) >> 7;

  auto abl = combine(lowHalf(a0), lowHalf(b0));

  auto abh = combine(highHalf(a0), highHalf(b0));

  abl += ((abh - abl) * fracx.xxxxyyyy) >> 7;

  auto c0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);

  auto c1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);

  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);

  auto d1 =

      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);

  d0 += ((d1 - d0) * fracy.w) >> 7;

  auto cdl = combine(lowHalf(c0), lowHalf(d0));

  auto cdh = combine(highHalf(c0), highHalf(d0));

  cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7;

  return combine(HalfRGBA8(abl), HalfRGBA8(cdl));

template <typename S>

static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i) {

  return pack(textureLinearUnpackedRGBA8(sampler, i));

template <typename S>

static PackedRGBA8 textureNearestPackedRGBA8(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::RGBA8);

  I32 row = computeRow(sampler, i, 0);

  return combine(unaligned_load<V4<uint8_t>>(&sampler->buf[row.x]),

                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.y]),

                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.z]),

                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.w]));

template <typename S>

static PackedR8 textureLinearPackedR8(S sampler, ivec2 i) {

  return pack(textureLinearUnpackedR8(sampler, i));

template <typename S>

static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i) {

  assert(sampler->format == TextureFormat::RG8);

  ivec2 frac = i & 0x7F;

  i >>= 7;

  I32 row0 = computeRow(sampler, i);

  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  I16 fracx = computeFracX(sampler, i, frac);

  I16 fracy = computeFracY(frac);

  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load RG bytes for two adjacent pixels - rgRG

  auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);

  auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);

  auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);

  // Load two pixels for next row

  auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);

  auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);

  auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);

  // Blend rows

  ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;

  auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);

  auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);

  auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);

  auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);

  auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);

  auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);

  // Blend rows

  cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;

  // ab = a.rgRG,b.rgRG

  // cd = c.rgRG,d.rgRG

  // ... ac = a.rg,c.rg,a.RG,c.RG

  // ... bd = b.rg,d.rg,b.RG,d.RG

  auto ac = zip2Low(ab0, cd0);

  auto bd = zip2High(ab0, cd0);

  // a.rg,b.rg,c.rg,d.rg

  // a.RG,b.RG,c.RG,d.RG

  auto abcdl = zip2Low(ac, bd);

  auto abcdh = zip2High(ac, bd);

  // Blend columns

  abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7;

  return WideRG8(abcdl);

template <typename S>

static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i) {

  return pack(textureLinearUnpackedRG8(sampler, i));

template <int N>

static ALWAYS_INLINE VectorType<uint16_t, N> addsat(VectorType<uint16_t, N> x,

                                                    VectorType<uint16_t, N> y) {

  auto r = x + y;

  return r | (r < x);

static inline V8<uint16_t> addsat(V8<uint16_t> x, V8<uint16_t> y) {

#if USE_SSE2

  return _mm_adds_epu16(x, y);

#elif USE_NEON

  return vqaddq_u16(x, y);

#else

  auto r = x + y;

  return r | (r < x);

#endif

template <typename P, typename S>

static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurHorizontal(

    S sampler, const ivec2_scalar& i, int minX, int maxX, int radius,

    float coeff, float coeffStep) {

  // Packed and unpacked vectors for a chunk of the given pixel type.

  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;

  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;

  // Pre-scale the coefficient by 8 bits of fractional precision, so that when

  // the sample is multiplied by it, it will yield a 16 bit unsigned integer

  // that will use all 16 bits of precision to accumulate the sum.

  coeff *= 1 << 8;

  float coeffStep2 = coeffStep * coeffStep;

  int row = computeRow(sampler, i);

  P* buf = (P*)sampler->buf;

  auto pixelsRight = unaligned_load<V4<P>>(&buf[row]);

  auto pixelsLeft = pixelsRight;

  auto sum = CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) *

             uint16_t(coeff + 0.5f);

  // Here we use some trickery to reuse the pixels within a chunk, shifted over

  // by one pixel, to get the next sample for the entire chunk. This allows us

  // to sample only one pixel for each offset across the entire chunk in both

  // the left and right directions. To avoid clamping within the loop to the

  // texture bounds, we compute the valid radius that doesn't require clamping

  // and fall back to a slower clamping loop outside of that valid radius.

  int offset = 1;

  // The left bound is how much we can offset the sample before the start of

  // the row bounds.

  int leftBound = i.x - max(minX, 0);

  // The right bound is how much we can offset the sample before the end of the

  // row bounds.

  int rightBound = min(maxX, sampler->width - 1) - i.x;

  int validRadius = min(radius, min(leftBound, rightBound - (4 - 1)));

  for (; offset <= validRadius; offset++) {

    // Overwrite the pixel that needs to be shifted out with the new pixel, and

    // shift it into the correct location.

    pixelsRight.x = unaligned_load<P>(&buf[row + offset + 4 - 1]);

    pixelsRight = pixelsRight.yzwx;

    pixelsLeft = pixelsLeft.wxyz;

    pixelsLeft.x = unaligned_load<P>(&buf[row - offset]);

    // Accumulate the Gaussian coefficients step-wise.

    coeff *= coeffStep;

    coeffStep *= coeffStep2;

    // Both left and right samples at this offset use the same coefficient.

    sum = addsat(sum,

                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +

                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *

                     uint16_t(coeff + 0.5f));

  for (; offset <= radius; offset++) {

    pixelsRight.x =

        unaligned_load<P>(&buf[row + min(offset + 4 - 1, rightBound)]);

    pixelsRight = pixelsRight.yzwx;

    pixelsLeft = pixelsLeft.wxyz;

    pixelsLeft.x = unaligned_load<P>(&buf[row - min(offset, leftBound)]);

    coeff *= coeffStep;

    coeffStep *= coeffStep2;

    sum = addsat(sum,

                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +

                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *

                     uint16_t(coeff + 0.5f));

  // Shift away the intermediate precision.

  return sum >> 8;

template <typename P, typename S>

static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurVertical(

    S sampler, const ivec2_scalar& i, int minY, int maxY, int radius,

    float coeff, float coeffStep) {

  // Packed and unpacked vectors for a chunk of the given pixel type.

  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;

  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;

  // Pre-scale the coefficient by 8 bits of fractional precision, so that when

  // the sample is multiplied by it, it will yield a 16 bit unsigned integer

  // that will use all 16 bits of precision to accumulate the sum.

  coeff *= 1 << 8;

  float coeffStep2 = coeffStep * coeffStep;

  int rowAbove = computeRow(sampler, i);

  int rowBelow = rowAbove;

  P* buf = (P*)sampler->buf;

  auto pixels = unaligned_load<V4<P>>(&buf[rowAbove]);

  auto sum = CONVERT(bit_cast<packed_type>(pixels), unpacked_type) *

             uint16_t(coeff + 0.5f);

  // For the vertical loop we can't be quite as creative with reusing old values

  // as we were in the horizontal loop. We just do the obvious implementation of

  // loading a chunk from each row in turn and accumulating it into the sum. We

  // compute a valid radius within which we don't need to clamp the sampled row

  // and use that to avoid any clamping in the main inner loop. We fall back to

  // a slower clamping loop outside of that valid radius.

  int offset = 1;

  int belowBound = i.y - max(minY, 0);

  int aboveBound = min(maxY, sampler->height - 1) - i.y;

  int validRadius = min(radius, min(belowBound, aboveBound));

  for (; offset <= validRadius; offset++) {

    rowAbove += sampler->stride;

    rowBelow -= sampler->stride;

    auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);

    auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);

    // Accumulate the Gaussian coefficients step-wise.

    coeff *= coeffStep;

    coeffStep *= coeffStep2;

    // Both above and below samples at this offset use the same coefficient.

    sum = addsat(sum,

                 (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +

                  CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *

                     uint16_t(coeff + 0.5f));

  for (; offset <= radius; offset++) {

    if (offset <= aboveBound) {

      rowAbove += sampler->stride;

    if (offset <= belowBound) {

      rowBelow -= sampler->stride;

    auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);

    auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);

    coeff *= coeffStep;

    coeffStep *= coeffStep2;

    sum = addsat(sum,

                 (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +

                  CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *

                     uint16_t(coeff + 0.5f));

  // Shift away the intermediate precision.

  return sum >> 8;

}  // namespace glsl