Revision control
Copy as Markdown
Other Tools
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#![allow(clippy::type_complexity)]
#![allow(clippy::erasing_op)]
#![allow(clippy::identity_op)]
use jxl_simd::{F32SimdVec, SimdDescriptor};
#[allow(clippy::too_many_arguments)]
#[allow(clippy::excessive_precision)]
#[inline(always)]
pub(super) fn reinterpreting_dct_32<D: SimdDescriptor>(
d: D,
v0: D::F32Vec,
v1: D::F32Vec,
v2: D::F32Vec,
v3: D::F32Vec,
v4: D::F32Vec,
v5: D::F32Vec,
v6: D::F32Vec,
v7: D::F32Vec,
v8: D::F32Vec,
v9: D::F32Vec,
v10: D::F32Vec,
v11: D::F32Vec,
v12: D::F32Vec,
v13: D::F32Vec,
v14: D::F32Vec,
v15: D::F32Vec,
v16: D::F32Vec,
v17: D::F32Vec,
v18: D::F32Vec,
v19: D::F32Vec,
v20: D::F32Vec,
v21: D::F32Vec,
v22: D::F32Vec,
v23: D::F32Vec,
v24: D::F32Vec,
v25: D::F32Vec,
v26: D::F32Vec,
v27: D::F32Vec,
v28: D::F32Vec,
v29: D::F32Vec,
v30: D::F32Vec,
v31: D::F32Vec,
) -> (
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
D::F32Vec,
) {
let v32 = v0 + v31;
let v33 = v1 + v30;
let v34 = v2 + v29;
let v35 = v3 + v28;
let v36 = v4 + v27;
let v37 = v5 + v26;
let v38 = v6 + v25;
let v39 = v7 + v24;
let v40 = v8 + v23;
let v41 = v9 + v22;
let v42 = v10 + v21;
let v43 = v11 + v20;
let v44 = v12 + v19;
let v45 = v13 + v18;
let v46 = v14 + v17;
let v47 = v15 + v16;
let v48 = v32 + v47;
let v49 = v33 + v46;
let v50 = v34 + v45;
let v51 = v35 + v44;
let v52 = v36 + v43;
let v53 = v37 + v42;
let v54 = v38 + v41;
let v55 = v39 + v40;
let v56 = v48 + v55;
let v57 = v49 + v54;
let v58 = v50 + v53;
let v59 = v51 + v52;
let v60 = v56 + v59;
let v61 = v57 + v58;
let v62 = v60 + v61;
let v63 = v60 - v61;
let v64 = v56 - v59;
let v65 = v57 - v58;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v66 = v64 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v67 = v65 * mul;
let v68 = v66 + v67;
let v69 = v66 - v67;
let v70 = v68.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v69);
let v71 = v48 - v55;
let v72 = v49 - v54;
let v73 = v50 - v53;
let v74 = v51 - v52;
let mul = D::F32Vec::splat(d, 0.5097955791041592);
let v75 = v71 * mul;
let mul = D::F32Vec::splat(d, 0.6013448869350453);
let v76 = v72 * mul;
let mul = D::F32Vec::splat(d, 0.8999762231364156);
let v77 = v73 * mul;
let mul = D::F32Vec::splat(d, 2.5629154477415055);
let v78 = v74 * mul;
let v79 = v75 + v78;
let v80 = v76 + v77;
let v81 = v79 + v80;
let v82 = v79 - v80;
let v83 = v75 - v78;
let v84 = v76 - v77;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v85 = v83 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v86 = v84 * mul;
let v87 = v85 + v86;
let v88 = v85 - v86;
let v89 = v87.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v88);
let v90 = v81.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v89);
let v91 = v89 + v82;
let v92 = v82 + v88;
let v93 = v32 - v47;
let v94 = v33 - v46;
let v95 = v34 - v45;
let v96 = v35 - v44;
let v97 = v36 - v43;
let v98 = v37 - v42;
let v99 = v38 - v41;
let v100 = v39 - v40;
let mul = D::F32Vec::splat(d, 0.5024192861881557);
let v101 = v93 * mul;
let mul = D::F32Vec::splat(d, 0.5224986149396889);
let v102 = v94 * mul;
let mul = D::F32Vec::splat(d, 0.5669440348163577);
let v103 = v95 * mul;
let mul = D::F32Vec::splat(d, 0.6468217833599901);
let v104 = v96 * mul;
let mul = D::F32Vec::splat(d, 0.7881546234512502);
let v105 = v97 * mul;
let mul = D::F32Vec::splat(d, 1.0606776859903471);
let v106 = v98 * mul;
let mul = D::F32Vec::splat(d, 1.7224470982383342);
let v107 = v99 * mul;
let mul = D::F32Vec::splat(d, 5.1011486186891553);
let v108 = v100 * mul;
let v109 = v101 + v108;
let v110 = v102 + v107;
let v111 = v103 + v106;
let v112 = v104 + v105;
let v113 = v109 + v112;
let v114 = v110 + v111;
let v115 = v113 + v114;
let v116 = v113 - v114;
let v117 = v109 - v112;
let v118 = v110 - v111;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v119 = v117 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v120 = v118 * mul;
let v121 = v119 + v120;
let v122 = v119 - v120;
let v123 = v121.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v122);
let v124 = v101 - v108;
let v125 = v102 - v107;
let v126 = v103 - v106;
let v127 = v104 - v105;
let mul = D::F32Vec::splat(d, 0.5097955791041592);
let v128 = v124 * mul;
let mul = D::F32Vec::splat(d, 0.6013448869350453);
let v129 = v125 * mul;
let mul = D::F32Vec::splat(d, 0.8999762231364156);
let v130 = v126 * mul;
let mul = D::F32Vec::splat(d, 2.5629154477415055);
let v131 = v127 * mul;
let v132 = v128 + v131;
let v133 = v129 + v130;
let v134 = v132 + v133;
let v135 = v132 - v133;
let v136 = v128 - v131;
let v137 = v129 - v130;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v138 = v136 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v139 = v137 * mul;
let v140 = v138 + v139;
let v141 = v138 - v139;
let v142 = v140.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v141);
let v143 = v134.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v142);
let v144 = v142 + v135;
let v145 = v135 + v141;
let v146 = v115.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v143);
let v147 = v143 + v123;
let v148 = v123 + v144;
let v149 = v144 + v116;
let v150 = v116 + v145;
let v151 = v145 + v122;
let v152 = v122 + v141;
let v153 = v0 - v31;
let v154 = v1 - v30;
let v155 = v2 - v29;
let v156 = v3 - v28;
let v157 = v4 - v27;
let v158 = v5 - v26;
let v159 = v6 - v25;
let v160 = v7 - v24;
let v161 = v8 - v23;
let v162 = v9 - v22;
let v163 = v10 - v21;
let v164 = v11 - v20;
let v165 = v12 - v19;
let v166 = v13 - v18;
let v167 = v14 - v17;
let v168 = v15 - v16;
let mul = D::F32Vec::splat(d, 0.5006029982351963);
let v169 = v153 * mul;
let mul = D::F32Vec::splat(d, 0.5054709598975436);
let v170 = v154 * mul;
let mul = D::F32Vec::splat(d, 0.5154473099226246);
let v171 = v155 * mul;
let mul = D::F32Vec::splat(d, 0.5310425910897841);
let v172 = v156 * mul;
let mul = D::F32Vec::splat(d, 0.5531038960344445);
let v173 = v157 * mul;
let mul = D::F32Vec::splat(d, 0.5829349682061339);
let v174 = v158 * mul;
let mul = D::F32Vec::splat(d, 0.6225041230356648);
let v175 = v159 * mul;
let mul = D::F32Vec::splat(d, 0.6748083414550057);
let v176 = v160 * mul;
let mul = D::F32Vec::splat(d, 0.7445362710022986);
let v177 = v161 * mul;
let mul = D::F32Vec::splat(d, 0.8393496454155268);
let v178 = v162 * mul;
let mul = D::F32Vec::splat(d, 0.9725682378619608);
let v179 = v163 * mul;
let mul = D::F32Vec::splat(d, 1.1694399334328847);
let v180 = v164 * mul;
let mul = D::F32Vec::splat(d, 1.4841646163141662);
let v181 = v165 * mul;
let mul = D::F32Vec::splat(d, 2.0577810099534108);
let v182 = v166 * mul;
let mul = D::F32Vec::splat(d, 3.4076084184687190);
let v183 = v167 * mul;
let mul = D::F32Vec::splat(d, 10.1900081235480329);
let v184 = v168 * mul;
let v185 = v169 + v184;
let v186 = v170 + v183;
let v187 = v171 + v182;
let v188 = v172 + v181;
let v189 = v173 + v180;
let v190 = v174 + v179;
let v191 = v175 + v178;
let v192 = v176 + v177;
let v193 = v185 + v192;
let v194 = v186 + v191;
let v195 = v187 + v190;
let v196 = v188 + v189;
let v197 = v193 + v196;
let v198 = v194 + v195;
let v199 = v197 + v198;
let v200 = v197 - v198;
let v201 = v193 - v196;
let v202 = v194 - v195;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v203 = v201 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v204 = v202 * mul;
let v205 = v203 + v204;
let v206 = v203 - v204;
let v207 = v205.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v206);
let v208 = v185 - v192;
let v209 = v186 - v191;
let v210 = v187 - v190;
let v211 = v188 - v189;
let mul = D::F32Vec::splat(d, 0.5097955791041592);
let v212 = v208 * mul;
let mul = D::F32Vec::splat(d, 0.6013448869350453);
let v213 = v209 * mul;
let mul = D::F32Vec::splat(d, 0.8999762231364156);
let v214 = v210 * mul;
let mul = D::F32Vec::splat(d, 2.5629154477415055);
let v215 = v211 * mul;
let v216 = v212 + v215;
let v217 = v213 + v214;
let v218 = v216 + v217;
let v219 = v216 - v217;
let v220 = v212 - v215;
let v221 = v213 - v214;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v222 = v220 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v223 = v221 * mul;
let v224 = v222 + v223;
let v225 = v222 - v223;
let v226 = v224.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v225);
let v227 = v218.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v226);
let v228 = v226 + v219;
let v229 = v219 + v225;
let v230 = v169 - v184;
let v231 = v170 - v183;
let v232 = v171 - v182;
let v233 = v172 - v181;
let v234 = v173 - v180;
let v235 = v174 - v179;
let v236 = v175 - v178;
let v237 = v176 - v177;
let mul = D::F32Vec::splat(d, 0.5024192861881557);
let v238 = v230 * mul;
let mul = D::F32Vec::splat(d, 0.5224986149396889);
let v239 = v231 * mul;
let mul = D::F32Vec::splat(d, 0.5669440348163577);
let v240 = v232 * mul;
let mul = D::F32Vec::splat(d, 0.6468217833599901);
let v241 = v233 * mul;
let mul = D::F32Vec::splat(d, 0.7881546234512502);
let v242 = v234 * mul;
let mul = D::F32Vec::splat(d, 1.0606776859903471);
let v243 = v235 * mul;
let mul = D::F32Vec::splat(d, 1.7224470982383342);
let v244 = v236 * mul;
let mul = D::F32Vec::splat(d, 5.1011486186891553);
let v245 = v237 * mul;
let v246 = v238 + v245;
let v247 = v239 + v244;
let v248 = v240 + v243;
let v249 = v241 + v242;
let v250 = v246 + v249;
let v251 = v247 + v248;
let v252 = v250 + v251;
let v253 = v250 - v251;
let v254 = v246 - v249;
let v255 = v247 - v248;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v256 = v254 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v257 = v255 * mul;
let v258 = v256 + v257;
let v259 = v256 - v257;
let v260 = v258.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v259);
let v261 = v238 - v245;
let v262 = v239 - v244;
let v263 = v240 - v243;
let v264 = v241 - v242;
let mul = D::F32Vec::splat(d, 0.5097955791041592);
let v265 = v261 * mul;
let mul = D::F32Vec::splat(d, 0.6013448869350453);
let v266 = v262 * mul;
let mul = D::F32Vec::splat(d, 0.8999762231364156);
let v267 = v263 * mul;
let mul = D::F32Vec::splat(d, 2.5629154477415055);
let v268 = v264 * mul;
let v269 = v265 + v268;
let v270 = v266 + v267;
let v271 = v269 + v270;
let v272 = v269 - v270;
let v273 = v265 - v268;
let v274 = v266 - v267;
let mul = D::F32Vec::splat(d, 0.5411961001461970);
let v275 = v273 * mul;
let mul = D::F32Vec::splat(d, 1.3065629648763764);
let v276 = v274 * mul;
let v277 = v275 + v276;
let v278 = v275 - v276;
let v279 = v277.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v278);
let v280 = v271.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v279);
let v281 = v279 + v272;
let v282 = v272 + v278;
let v283 = v252.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v280);
let v284 = v280 + v260;
let v285 = v260 + v281;
let v286 = v281 + v253;
let v287 = v253 + v282;
let v288 = v282 + v259;
let v289 = v259 + v278;
let v290 = v199.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v283);
let v291 = v283 + v227;
let v292 = v227 + v284;
let v293 = v284 + v207;
let v294 = v207 + v285;
let v295 = v285 + v228;
let v296 = v228 + v286;
let v297 = v286 + v200;
let v298 = v200 + v287;
let v299 = v287 + v229;
let v300 = v229 + v288;
let v301 = v288 + v206;
let v302 = v206 + v289;
let v303 = v289 + v225;
let v304 = v225 + v278;
(
v62 * D::F32Vec::splat(d, 0.031250),
v290 * D::F32Vec::splat(d, 0.031262),
v146 * D::F32Vec::splat(d, 0.031299),
v291 * D::F32Vec::splat(d, 0.031361),
v90 * D::F32Vec::splat(d, 0.031449),
v292 * D::F32Vec::splat(d, 0.031561),
v147 * D::F32Vec::splat(d, 0.031699),
v293 * D::F32Vec::splat(d, 0.031864),
v70 * D::F32Vec::splat(d, 0.032055),
v294 * D::F32Vec::splat(d, 0.032274),
v148 * D::F32Vec::splat(d, 0.032521),
v295 * D::F32Vec::splat(d, 0.032797),
v91 * D::F32Vec::splat(d, 0.033103),
v296 * D::F32Vec::splat(d, 0.033441),
v149 * D::F32Vec::splat(d, 0.033811),
v297 * D::F32Vec::splat(d, 0.034215),
v63 * D::F32Vec::splat(d, 0.034654),
v298 * D::F32Vec::splat(d, 0.035131),
v150 * D::F32Vec::splat(d, 0.035647),
v299 * D::F32Vec::splat(d, 0.036204),
v92 * D::F32Vec::splat(d, 0.036806),
v300 * D::F32Vec::splat(d, 0.037453),
v151 * D::F32Vec::splat(d, 0.038150),
v301 * D::F32Vec::splat(d, 0.038899),
v69 * D::F32Vec::splat(d, 0.039705),
v302 * D::F32Vec::splat(d, 0.040571),
v152 * D::F32Vec::splat(d, 0.041502),
v303 * D::F32Vec::splat(d, 0.042502),
v88 * D::F32Vec::splat(d, 0.043578),
v304 * D::F32Vec::splat(d, 0.044735),
v141 * D::F32Vec::splat(d, 0.045981),
v278 * D::F32Vec::splat(d, 0.047324),
)
}
/// Runs [`reinterpreting_dct_32`] in place over 32 vectors of `data` spaced
/// `stride` elements apart.
///
/// Input `k` is loaded from `data[k * stride]`, and result `k` of the
/// transform is stored back into that same slot.
///
/// # Panics
///
/// Panics if `data.len() <= 31 * stride`.
#[inline(always)]
pub(super) fn do_reinterpreting_dct_32<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
) {
    assert!(data.len() > 31 * stride);

    // Gather every input before storing anything: results overwrite the
    // slots the inputs were read from.
    let [
        a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18,
        a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31,
    ]: [D::F32Vec; 32] = std::array::from_fn(|k| D::F32Vec::load_array(d, &data[k * stride]));

    let (
        b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16, b17, b18,
        b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31,
    ) = reinterpreting_dct_32(
        d, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18,
        a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31,
    );

    let transformed = [
        b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16, b17, b18,
        b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31,
    ];
    for (k, value) in transformed.iter().enumerate() {
        value.store_array(&mut data[k * stride]);
    }
}
/// Runs [`reinterpreting_dct_32`] in place over the first 32 entries of
/// `data`, using a lane-count dependent slot order.
///
/// With `LEN = D::F32Vec::LEN` and `row_stride = 32 / LEN`, input `k` is
/// loaded from `data[row_stride * (k % LEN) + k / LEN]` and result `k` is
/// stored back into that same slot.
///
/// Compilation fails (via the inline `const` assertion) when `LEN` does not
/// divide 32.
///
/// # Panics
///
/// Panics if `data.len() < 32`.
#[inline(always)]
pub(super) fn do_reinterpreting_dct_32_rowblock<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
) {
    assert!(data.len() >= 32);
    const { assert!(32usize.is_multiple_of(D::F32Vec::LEN)) };
    let row_stride = 32 / D::F32Vec::LEN;

    // Position inside `data` that holds transform input/result number `k`.
    let slot_of = |k: usize| row_stride * (k % D::F32Vec::LEN) + (k / D::F32Vec::LEN);

    // Gather every input before storing anything: results overwrite the
    // slots the inputs were read from.
    let [
        a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18,
        a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31,
    ]: [D::F32Vec; 32] = std::array::from_fn(|k| D::F32Vec::load_array(d, &data[slot_of(k)]));

    let (
        b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16, b17, b18,
        b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31,
    ) = reinterpreting_dct_32(
        d, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18,
        a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31,
    );

    let transformed = [
        b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16, b17, b18,
        b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31,
    ];
    for (k, value) in transformed.iter().enumerate() {
        value.store_array(&mut data[slot_of(k)]);
    }
}
/// Runs [`reinterpreting_dct_32`] over 32 vectors of `data` and writes the
/// results back with the two halves of the output interleaved.
///
/// With `row_stride = 16 / D::F32Vec::LEN`, input `k` is loaded from
/// `data[row_stride * k]`. On the way out, result `k` (for `k < 16`) goes to
/// `data[row_stride * 2 * k]` and result `16 + k` goes to
/// `data[row_stride * (2 * k + 1)]`.
///
/// Compilation fails (via the inline `const` assertion) when
/// `D::F32Vec::LEN` does not divide 16.
///
/// # Panics
///
/// Panics if `data.len() <= 31 * row_stride`.
#[inline(always)]
pub(super) fn do_reinterpreting_dct_32_trh<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
) {
    let row_stride = 16 / D::F32Vec::LEN;
    assert!(data.len() > 31 * row_stride);
    const { assert!(16usize.is_multiple_of(D::F32Vec::LEN)) };

    // Gather every input before storing anything: results overwrite the
    // slots the inputs were read from.
    let [
        a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18,
        a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31,
    ]: [D::F32Vec; 32] = std::array::from_fn(|k| D::F32Vec::load_array(d, &data[row_stride * k]));

    let (
        b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16, b17, b18,
        b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31,
    ) = reinterpreting_dct_32(
        d, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18,
        a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31,
    );

    // Results 0..15 fill the even slots, results 16..31 the odd slots.
    let lower_half = [
        b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15,
    ];
    let upper_half = [
        b16, b17, b18, b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31,
    ];
    for (k, (low, high)) in lower_half.iter().zip(upper_half.iter()).enumerate() {
        low.store_array(&mut data[row_stride * (2 * k)]);
        high.store_array(&mut data[row_stride * (2 * k + 1)]);
    }
}