Source code

Revision control

Other Tools

1
/*
2
* Copyright 2018 Google Inc.
3
*
4
* Use of this source code is governed by a BSD-style license that can be
5
* found in the LICENSE file.
6
*/
7
8
#if defined(__AVX2__)
9
10
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
12
13
namespace hsw {

    // Vertically convolves `filterLen` source rows into one output row of 4-byte pixels.
    //
    //   filter    - `filterLen` signed 16-bit coefficients.  The accumulated sums are shifted
    //               right by 14 bits, i.e. the coefficients carry 14 fractional bits.
    //   srcRows   - `filterLen` row pointers; row i is weighted by filter[i].
    //   width     - number of output pixels to produce.
    //   out       - destination; exactly width*4 bytes are written.
    //   hasAlpha  - true:  the 4th byte of each pixel is raised to at least max of the other three.
    //               false: the 4th byte of each pixel is forced to 0xff.
    //
    // Pixels are processed in groups of eight, and every row load reads a full 32 bytes at
    // byte offset x*4 even for a partial last group.  Each source row must therefore be
    // readable for width rounded up to a multiple of 8 pixels.
    void convolve_vertically(const int16_t* filter, int filterLen,
                             uint8_t* const* srcRows, int width,
                             uint8_t* out, bool hasAlpha) {
        // Byte cursor into the output; advances 32 bytes per full group, 4 bytes per tail pixel.
        uint8_t* dst = out;

        // Output up to eight pixels per iteration.
        for (int x = 0; x < width; x += 8) {
            // Accumulated result for 4 (non-adjacent) pairs of pixels,
            // with each channel in signed 17.14 fixed point.
            __m256i accum04 = _mm256_setzero_si256(),
                    accum15 = _mm256_setzero_si256(),
                    accum26 = _mm256_setzero_si256(),
                    accum37 = _mm256_setzero_si256();

            // Convolve with the filter.  (This inner loop is where we spend ~all our time.)
            // While we can, we consume 2 filter coefficients and 2 rows of 8 pixels each at a time.
            auto convolve_16_pixels = [&](__m256i interlaced_coeffs,
                                          __m256i pixels_01234567, __m256i pixels_89ABCDEF) {
                // Interlaced R0R8 G0G8 B0B8 A0A8 R1R9 G1G9... 32 8-bit values each.
                const __m256i _08194C5D = _mm256_unpacklo_epi8(pixels_01234567, pixels_89ABCDEF),
                              _2A3B6E7F = _mm256_unpackhi_epi8(pixels_01234567, pixels_89ABCDEF);

                // Still interlaced R0R8 G0G8... as above, each channel expanded to 16-bit lanes.
                const __m256i zero  = _mm256_setzero_si256();
                const __m256i _084C = _mm256_unpacklo_epi8(_08194C5D, zero),
                              _195D = _mm256_unpackhi_epi8(_08194C5D, zero),
                              _2A6E = _mm256_unpacklo_epi8(_2A3B6E7F, zero),
                              _3B7F = _mm256_unpackhi_epi8(_2A3B6E7F, zero);

                // accum0_R += R0*coeff0 + R8*coeff1, etc.
                accum04 = _mm256_add_epi32(accum04, _mm256_madd_epi16(_084C, interlaced_coeffs));
                accum15 = _mm256_add_epi32(accum15, _mm256_madd_epi16(_195D, interlaced_coeffs));
                accum26 = _mm256_add_epi32(accum26, _mm256_madd_epi16(_2A6E, interlaced_coeffs));
                accum37 = _mm256_add_epi32(accum37, _mm256_madd_epi16(_3B7F, interlaced_coeffs));
            };

            int i = 0;
            for (; i < filterLen/2*2; i += 2) {
                // Fetch filter[i] and filter[i+1] as one 32-bit value.  memcpy keeps this
                // well-defined: `filter` is an int16_t array, so dereferencing it through an
                // int32_t* would break strict aliasing and could be a misaligned access.
                int32_t coeff_pair;
                memcpy(&coeff_pair, filter + i, sizeof(coeff_pair));

                convolve_16_pixels(_mm256_set1_epi32(coeff_pair),
                                   _mm256_loadu_si256((const __m256i*)(srcRows[i+0] + x*4)),
                                   _mm256_loadu_si256((const __m256i*)(srcRows[i+1] + x*4)));
            }
            if (i < filterLen) {
                // Odd filter length: pair the last row with an all-zero row, so whatever lands
                // in the upper 16 bits of each coefficient lane is multiplied by zero.
                convolve_16_pixels(_mm256_set1_epi32(filter[i]),
                                   _mm256_loadu_si256((const __m256i*)(srcRows[i] + x*4)),
                                   _mm256_setzero_si256());
            }

            // Trim the fractional parts off the accumulators.
            accum04 = _mm256_srai_epi32(accum04, 14);
            accum15 = _mm256_srai_epi32(accum15, 14);
            accum26 = _mm256_srai_epi32(accum26, 14);
            accum37 = _mm256_srai_epi32(accum37, 14);

            // Pack back down to 8-bit channels (saturating), which also restores pixel order.
            __m256i pixels = _mm256_packus_epi16(_mm256_packs_epi32(accum04, accum15),
                                                 _mm256_packs_epi32(accum26, accum37));

            if (hasAlpha) {
                // Clamp alpha to the max of r,g,b to make sure we stay premultiplied.
                const __m256i max_rg  = _mm256_max_epu8(pixels, _mm256_srli_epi32(pixels,  8)),
                              max_rgb = _mm256_max_epu8(max_rg, _mm256_srli_epi32(pixels, 16));
                pixels = _mm256_max_epu8(pixels, _mm256_slli_epi32(max_rgb, 24));
            } else {
                // Force opaque.
                pixels = _mm256_or_si256(pixels, _mm256_set1_epi32((int)0xff000000));
            }

            // Normal path to store 8 pixels.
            if (x + 8 <= width) {
                _mm256_storeu_si256((__m256i*)dst, pixels);
                dst += 8*4;
                continue;
            }

            // Store one pixel at a time on the last iteration: write the lowest 32-bit lane,
            // then rotate the next pixel down into that lane.
            const __m256i rotate_down = _mm256_setr_epi32(1,2,3,4,5,6,7,0);
            for (int px = x; px < width; px++) {
                const int32_t lowest = _mm_cvtsi128_si32(_mm256_castsi256_si128(pixels));
                memcpy(dst, &lowest, sizeof(lowest));
                dst += 4;
                pixels = _mm256_permutevar8x32_epi32(pixels, rotate_down);
            }
        }
    }

}
99
100
#include "src/core/SkOpts.h"
101
102
#define SK_OPTS_NS hsw
103
#include "src/core/SkCubicSolver.h"
104
#include "src/opts/SkBitmapProcState_opts.h"
105
#include "src/opts/SkBlitRow_opts.h"
106
#include "src/opts/SkRasterPipeline_opts.h"
107
#include "src/opts/SkUtils_opts.h"
108
109
namespace SkOpts {
110
// See SkOpts.h, writing SkConvolutionFilter1D::ConvolutionFixed as the underlying type.
111
extern void (*convolve_vertically)(const int16_t* filter, int filterLen,
112
uint8_t* const* srcRows, int width,
113
uint8_t* out, bool hasAlpha);
114
void Init_hsw() {
115
convolve_vertically = hsw::convolve_vertically;
116
117
blit_row_color32 = hsw::blit_row_color32;
118
blit_row_s32a_opaque = hsw::blit_row_s32a_opaque;
119
120
S32_alpha_D32_filter_DX = hsw::S32_alpha_D32_filter_DX;
121
122
cubic_solver = SK_OPTS_NS::cubic_solver;
123
124
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
125
SK_RASTER_PIPELINE_STAGES(M)
126
just_return_highp = (StageFn)SK_OPTS_NS::just_return;
127
start_pipeline_highp = SK_OPTS_NS::start_pipeline;
128
#undef M
129
130
#define M(st) stages_lowp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::lowp::st;
131
SK_RASTER_PIPELINE_STAGES(M)
132
just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
133
start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
134
#undef M
135
}
136
}
137
138
#else // defined(__AVX2__) is not true...
139
140
namespace SkOpts {

    // This translation unit was built without __AVX2__, so there are no hsw routines to
    // install; Init_hsw() is an empty function.
    void Init_hsw() {}

}
141
142
#endif