// reinterpreting_dct32.rs

// Copyright (c) the JPEG XL Project Authors. All rights reserved.

//

// Use of this source code is governed by a BSD-style

// license that can be found in the LICENSE file.

#![allow(clippy::type_complexity)]

#![allow(clippy::erasing_op)]

#![allow(clippy::identity_op)]

use jxl_simd::{F32SimdVec, SimdDescriptor};

#[allow(clippy::too_many_arguments)]

#[allow(clippy::excessive_precision)]

#[inline(always)]

pub(super) fn reinterpreting_dct_32<D: SimdDescriptor>(

    d: D,

    v0: D::F32Vec,

    v1: D::F32Vec,

    v2: D::F32Vec,

    v3: D::F32Vec,

    v4: D::F32Vec,

    v5: D::F32Vec,

    v6: D::F32Vec,

    v7: D::F32Vec,

    v8: D::F32Vec,

    v9: D::F32Vec,

    v10: D::F32Vec,

    v11: D::F32Vec,

    v12: D::F32Vec,

    v13: D::F32Vec,

    v14: D::F32Vec,

    v15: D::F32Vec,

    v16: D::F32Vec,

    v17: D::F32Vec,

    v18: D::F32Vec,

    v19: D::F32Vec,

    v20: D::F32Vec,

    v21: D::F32Vec,

    v22: D::F32Vec,

    v23: D::F32Vec,

    v24: D::F32Vec,

    v25: D::F32Vec,

    v26: D::F32Vec,

    v27: D::F32Vec,

    v28: D::F32Vec,

    v29: D::F32Vec,

    v30: D::F32Vec,

    v31: D::F32Vec,

) -> (

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

    D::F32Vec,

) {

    let v32 = v0 + v31;

    let v33 = v1 + v30;

    let v34 = v2 + v29;

    let v35 = v3 + v28;

    let v36 = v4 + v27;

    let v37 = v5 + v26;

    let v38 = v6 + v25;

    let v39 = v7 + v24;

    let v40 = v8 + v23;

    let v41 = v9 + v22;

    let v42 = v10 + v21;

    let v43 = v11 + v20;

    let v44 = v12 + v19;

    let v45 = v13 + v18;

    let v46 = v14 + v17;

    let v47 = v15 + v16;

    let v48 = v32 + v47;

    let v49 = v33 + v46;

    let v50 = v34 + v45;

    let v51 = v35 + v44;

    let v52 = v36 + v43;

    let v53 = v37 + v42;

    let v54 = v38 + v41;

    let v55 = v39 + v40;

    let v56 = v48 + v55;

    let v57 = v49 + v54;

    let v58 = v50 + v53;

    let v59 = v51 + v52;

    let v60 = v56 + v59;

    let v61 = v57 + v58;

    let v62 = v60 + v61;

    let v63 = v60 - v61;

    let v64 = v56 - v59;

    let v65 = v57 - v58;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v66 = v64 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v67 = v65 * mul;

    let v68 = v66 + v67;

    let v69 = v66 - v67;

    let v70 = v68.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v69);

    let v71 = v48 - v55;

    let v72 = v49 - v54;

    let v73 = v50 - v53;

    let v74 = v51 - v52;

    let mul = D::F32Vec::splat(d, 0.5097955791041592);

    let v75 = v71 * mul;

    let mul = D::F32Vec::splat(d, 0.6013448869350453);

    let v76 = v72 * mul;

    let mul = D::F32Vec::splat(d, 0.8999762231364156);

    let v77 = v73 * mul;

    let mul = D::F32Vec::splat(d, 2.5629154477415055);

    let v78 = v74 * mul;

    let v79 = v75 + v78;

    let v80 = v76 + v77;

    let v81 = v79 + v80;

    let v82 = v79 - v80;

    let v83 = v75 - v78;

    let v84 = v76 - v77;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v85 = v83 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v86 = v84 * mul;

    let v87 = v85 + v86;

    let v88 = v85 - v86;

    let v89 = v87.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v88);

    let v90 = v81.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v89);

    let v91 = v89 + v82;

    let v92 = v82 + v88;

    let v93 = v32 - v47;

    let v94 = v33 - v46;

    let v95 = v34 - v45;

    let v96 = v35 - v44;

    let v97 = v36 - v43;

    let v98 = v37 - v42;

    let v99 = v38 - v41;

    let v100 = v39 - v40;

    let mul = D::F32Vec::splat(d, 0.5024192861881557);

    let v101 = v93 * mul;

    let mul = D::F32Vec::splat(d, 0.5224986149396889);

    let v102 = v94 * mul;

    let mul = D::F32Vec::splat(d, 0.5669440348163577);

    let v103 = v95 * mul;

    let mul = D::F32Vec::splat(d, 0.6468217833599901);

    let v104 = v96 * mul;

    let mul = D::F32Vec::splat(d, 0.7881546234512502);

    let v105 = v97 * mul;

    let mul = D::F32Vec::splat(d, 1.0606776859903471);

    let v106 = v98 * mul;

    let mul = D::F32Vec::splat(d, 1.7224470982383342);

    let v107 = v99 * mul;

    let mul = D::F32Vec::splat(d, 5.1011486186891553);

    let v108 = v100 * mul;

    let v109 = v101 + v108;

    let v110 = v102 + v107;

    let v111 = v103 + v106;

    let v112 = v104 + v105;

    let v113 = v109 + v112;

    let v114 = v110 + v111;

    let v115 = v113 + v114;

    let v116 = v113 - v114;

    let v117 = v109 - v112;

    let v118 = v110 - v111;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v119 = v117 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v120 = v118 * mul;

    let v121 = v119 + v120;

    let v122 = v119 - v120;

    let v123 = v121.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v122);

    let v124 = v101 - v108;

    let v125 = v102 - v107;

    let v126 = v103 - v106;

    let v127 = v104 - v105;

    let mul = D::F32Vec::splat(d, 0.5097955791041592);

    let v128 = v124 * mul;

    let mul = D::F32Vec::splat(d, 0.6013448869350453);

    let v129 = v125 * mul;

    let mul = D::F32Vec::splat(d, 0.8999762231364156);

    let v130 = v126 * mul;

    let mul = D::F32Vec::splat(d, 2.5629154477415055);

    let v131 = v127 * mul;

    let v132 = v128 + v131;

    let v133 = v129 + v130;

    let v134 = v132 + v133;

    let v135 = v132 - v133;

    let v136 = v128 - v131;

    let v137 = v129 - v130;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v138 = v136 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v139 = v137 * mul;

    let v140 = v138 + v139;

    let v141 = v138 - v139;

    let v142 = v140.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v141);

    let v143 = v134.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v142);

    let v144 = v142 + v135;

    let v145 = v135 + v141;

    let v146 = v115.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v143);

    let v147 = v143 + v123;

    let v148 = v123 + v144;

    let v149 = v144 + v116;

    let v150 = v116 + v145;

    let v151 = v145 + v122;

    let v152 = v122 + v141;

    let v153 = v0 - v31;

    let v154 = v1 - v30;

    let v155 = v2 - v29;

    let v156 = v3 - v28;

    let v157 = v4 - v27;

    let v158 = v5 - v26;

    let v159 = v6 - v25;

    let v160 = v7 - v24;

    let v161 = v8 - v23;

    let v162 = v9 - v22;

    let v163 = v10 - v21;

    let v164 = v11 - v20;

    let v165 = v12 - v19;

    let v166 = v13 - v18;

    let v167 = v14 - v17;

    let v168 = v15 - v16;

    let mul = D::F32Vec::splat(d, 0.5006029982351963);

    let v169 = v153 * mul;

    let mul = D::F32Vec::splat(d, 0.5054709598975436);

    let v170 = v154 * mul;

    let mul = D::F32Vec::splat(d, 0.5154473099226246);

    let v171 = v155 * mul;

    let mul = D::F32Vec::splat(d, 0.5310425910897841);

    let v172 = v156 * mul;

    let mul = D::F32Vec::splat(d, 0.5531038960344445);

    let v173 = v157 * mul;

    let mul = D::F32Vec::splat(d, 0.5829349682061339);

    let v174 = v158 * mul;

    let mul = D::F32Vec::splat(d, 0.6225041230356648);

    let v175 = v159 * mul;

    let mul = D::F32Vec::splat(d, 0.6748083414550057);

    let v176 = v160 * mul;

    let mul = D::F32Vec::splat(d, 0.7445362710022986);

    let v177 = v161 * mul;

    let mul = D::F32Vec::splat(d, 0.8393496454155268);

    let v178 = v162 * mul;

    let mul = D::F32Vec::splat(d, 0.9725682378619608);

    let v179 = v163 * mul;

    let mul = D::F32Vec::splat(d, 1.1694399334328847);

    let v180 = v164 * mul;

    let mul = D::F32Vec::splat(d, 1.4841646163141662);

    let v181 = v165 * mul;

    let mul = D::F32Vec::splat(d, 2.0577810099534108);

    let v182 = v166 * mul;

    let mul = D::F32Vec::splat(d, 3.4076084184687190);

    let v183 = v167 * mul;

    let mul = D::F32Vec::splat(d, 10.1900081235480329);

    let v184 = v168 * mul;

    let v185 = v169 + v184;

    let v186 = v170 + v183;

    let v187 = v171 + v182;

    let v188 = v172 + v181;

    let v189 = v173 + v180;

    let v190 = v174 + v179;

    let v191 = v175 + v178;

    let v192 = v176 + v177;

    let v193 = v185 + v192;

    let v194 = v186 + v191;

    let v195 = v187 + v190;

    let v196 = v188 + v189;

    let v197 = v193 + v196;

    let v198 = v194 + v195;

    let v199 = v197 + v198;

    let v200 = v197 - v198;

    let v201 = v193 - v196;

    let v202 = v194 - v195;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v203 = v201 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v204 = v202 * mul;

    let v205 = v203 + v204;

    let v206 = v203 - v204;

    let v207 = v205.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v206);

    let v208 = v185 - v192;

    let v209 = v186 - v191;

    let v210 = v187 - v190;

    let v211 = v188 - v189;

    let mul = D::F32Vec::splat(d, 0.5097955791041592);

    let v212 = v208 * mul;

    let mul = D::F32Vec::splat(d, 0.6013448869350453);

    let v213 = v209 * mul;

    let mul = D::F32Vec::splat(d, 0.8999762231364156);

    let v214 = v210 * mul;

    let mul = D::F32Vec::splat(d, 2.5629154477415055);

    let v215 = v211 * mul;

    let v216 = v212 + v215;

    let v217 = v213 + v214;

    let v218 = v216 + v217;

    let v219 = v216 - v217;

    let v220 = v212 - v215;

    let v221 = v213 - v214;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v222 = v220 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v223 = v221 * mul;

    let v224 = v222 + v223;

    let v225 = v222 - v223;

    let v226 = v224.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v225);

    let v227 = v218.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v226);

    let v228 = v226 + v219;

    let v229 = v219 + v225;

    let v230 = v169 - v184;

    let v231 = v170 - v183;

    let v232 = v171 - v182;

    let v233 = v172 - v181;

    let v234 = v173 - v180;

    let v235 = v174 - v179;

    let v236 = v175 - v178;

    let v237 = v176 - v177;

    let mul = D::F32Vec::splat(d, 0.5024192861881557);

    let v238 = v230 * mul;

    let mul = D::F32Vec::splat(d, 0.5224986149396889);

    let v239 = v231 * mul;

    let mul = D::F32Vec::splat(d, 0.5669440348163577);

    let v240 = v232 * mul;

    let mul = D::F32Vec::splat(d, 0.6468217833599901);

    let v241 = v233 * mul;

    let mul = D::F32Vec::splat(d, 0.7881546234512502);

    let v242 = v234 * mul;

    let mul = D::F32Vec::splat(d, 1.0606776859903471);

    let v243 = v235 * mul;

    let mul = D::F32Vec::splat(d, 1.7224470982383342);

    let v244 = v236 * mul;

    let mul = D::F32Vec::splat(d, 5.1011486186891553);

    let v245 = v237 * mul;

    let v246 = v238 + v245;

    let v247 = v239 + v244;

    let v248 = v240 + v243;

    let v249 = v241 + v242;

    let v250 = v246 + v249;

    let v251 = v247 + v248;

    let v252 = v250 + v251;

    let v253 = v250 - v251;

    let v254 = v246 - v249;

    let v255 = v247 - v248;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v256 = v254 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v257 = v255 * mul;

    let v258 = v256 + v257;

    let v259 = v256 - v257;

    let v260 = v258.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v259);

    let v261 = v238 - v245;

    let v262 = v239 - v244;

    let v263 = v240 - v243;

    let v264 = v241 - v242;

    let mul = D::F32Vec::splat(d, 0.5097955791041592);

    let v265 = v261 * mul;

    let mul = D::F32Vec::splat(d, 0.6013448869350453);

    let v266 = v262 * mul;

    let mul = D::F32Vec::splat(d, 0.8999762231364156);

    let v267 = v263 * mul;

    let mul = D::F32Vec::splat(d, 2.5629154477415055);

    let v268 = v264 * mul;

    let v269 = v265 + v268;

    let v270 = v266 + v267;

    let v271 = v269 + v270;

    let v272 = v269 - v270;

    let v273 = v265 - v268;

    let v274 = v266 - v267;

    let mul = D::F32Vec::splat(d, 0.5411961001461970);

    let v275 = v273 * mul;

    let mul = D::F32Vec::splat(d, 1.3065629648763764);

    let v276 = v274 * mul;

    let v277 = v275 + v276;

    let v278 = v275 - v276;

    let v279 = v277.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v278);

    let v280 = v271.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v279);

    let v281 = v279 + v272;

    let v282 = v272 + v278;

    let v283 = v252.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v280);

    let v284 = v280 + v260;

    let v285 = v260 + v281;

    let v286 = v281 + v253;

    let v287 = v253 + v282;

    let v288 = v282 + v259;

    let v289 = v259 + v278;

    let v290 = v199.mul_add(D::F32Vec::splat(d, std::f32::consts::SQRT_2), v283);

    let v291 = v283 + v227;

    let v292 = v227 + v284;

    let v293 = v284 + v207;

    let v294 = v207 + v285;

    let v295 = v285 + v228;

    let v296 = v228 + v286;

    let v297 = v286 + v200;

    let v298 = v200 + v287;

    let v299 = v287 + v229;

    let v300 = v229 + v288;

    let v301 = v288 + v206;

    let v302 = v206 + v289;

    let v303 = v289 + v225;

    let v304 = v225 + v278;

        v62 * D::F32Vec::splat(d, 0.031250),

        v290 * D::F32Vec::splat(d, 0.031262),

        v146 * D::F32Vec::splat(d, 0.031299),

        v291 * D::F32Vec::splat(d, 0.031361),

        v90 * D::F32Vec::splat(d, 0.031449),

        v292 * D::F32Vec::splat(d, 0.031561),

        v147 * D::F32Vec::splat(d, 0.031699),

        v293 * D::F32Vec::splat(d, 0.031864),

        v70 * D::F32Vec::splat(d, 0.032055),

        v294 * D::F32Vec::splat(d, 0.032274),

        v148 * D::F32Vec::splat(d, 0.032521),

        v295 * D::F32Vec::splat(d, 0.032797),

        v91 * D::F32Vec::splat(d, 0.033103),

        v296 * D::F32Vec::splat(d, 0.033441),

        v149 * D::F32Vec::splat(d, 0.033811),

        v297 * D::F32Vec::splat(d, 0.034215),

        v63 * D::F32Vec::splat(d, 0.034654),

        v298 * D::F32Vec::splat(d, 0.035131),

        v150 * D::F32Vec::splat(d, 0.035647),

        v299 * D::F32Vec::splat(d, 0.036204),

        v92 * D::F32Vec::splat(d, 0.036806),

        v300 * D::F32Vec::splat(d, 0.037453),

        v151 * D::F32Vec::splat(d, 0.038150),

        v301 * D::F32Vec::splat(d, 0.038899),

        v69 * D::F32Vec::splat(d, 0.039705),

        v302 * D::F32Vec::splat(d, 0.040571),

        v152 * D::F32Vec::splat(d, 0.041502),

        v303 * D::F32Vec::splat(d, 0.042502),

        v88 * D::F32Vec::splat(d, 0.043578),

        v304 * D::F32Vec::splat(d, 0.044735),

        v141 * D::F32Vec::splat(d, 0.045981),

        v278 * D::F32Vec::splat(d, 0.047324),

#[inline(always)]

pub(super) fn do_reinterpreting_dct_32<D: SimdDescriptor>(

    d: D,

    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],

    stride: usize,

) {

    assert!(data.len() > 31 * stride);

    let mut v0 = D::F32Vec::load_array(d, &data[0 * stride]);

    let mut v1 = D::F32Vec::load_array(d, &data[1 * stride]);

    let mut v2 = D::F32Vec::load_array(d, &data[2 * stride]);

    let mut v3 = D::F32Vec::load_array(d, &data[3 * stride]);

    let mut v4 = D::F32Vec::load_array(d, &data[4 * stride]);

    let mut v5 = D::F32Vec::load_array(d, &data[5 * stride]);

    let mut v6 = D::F32Vec::load_array(d, &data[6 * stride]);

    let mut v7 = D::F32Vec::load_array(d, &data[7 * stride]);

    let mut v8 = D::F32Vec::load_array(d, &data[8 * stride]);

    let mut v9 = D::F32Vec::load_array(d, &data[9 * stride]);

    let mut v10 = D::F32Vec::load_array(d, &data[10 * stride]);

    let mut v11 = D::F32Vec::load_array(d, &data[11 * stride]);

    let mut v12 = D::F32Vec::load_array(d, &data[12 * stride]);

    let mut v13 = D::F32Vec::load_array(d, &data[13 * stride]);

    let mut v14 = D::F32Vec::load_array(d, &data[14 * stride]);

    let mut v15 = D::F32Vec::load_array(d, &data[15 * stride]);

    let mut v16 = D::F32Vec::load_array(d, &data[16 * stride]);

    let mut v17 = D::F32Vec::load_array(d, &data[17 * stride]);

    let mut v18 = D::F32Vec::load_array(d, &data[18 * stride]);

    let mut v19 = D::F32Vec::load_array(d, &data[19 * stride]);

    let mut v20 = D::F32Vec::load_array(d, &data[20 * stride]);

    let mut v21 = D::F32Vec::load_array(d, &data[21 * stride]);

    let mut v22 = D::F32Vec::load_array(d, &data[22 * stride]);

    let mut v23 = D::F32Vec::load_array(d, &data[23 * stride]);

    let mut v24 = D::F32Vec::load_array(d, &data[24 * stride]);

    let mut v25 = D::F32Vec::load_array(d, &data[25 * stride]);

    let mut v26 = D::F32Vec::load_array(d, &data[26 * stride]);

    let mut v27 = D::F32Vec::load_array(d, &data[27 * stride]);

    let mut v28 = D::F32Vec::load_array(d, &data[28 * stride]);

    let mut v29 = D::F32Vec::load_array(d, &data[29 * stride]);

    let mut v30 = D::F32Vec::load_array(d, &data[30 * stride]);

    let mut v31 = D::F32Vec::load_array(d, &data[31 * stride]);

        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,

        v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,

    ) = reinterpreting_dct_32(

        d, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,

        v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,

);

    v0.store_array(&mut data[0 * stride]);

    v1.store_array(&mut data[1 * stride]);

    v2.store_array(&mut data[2 * stride]);

    v3.store_array(&mut data[3 * stride]);

    v4.store_array(&mut data[4 * stride]);

    v5.store_array(&mut data[5 * stride]);

    v6.store_array(&mut data[6 * stride]);

    v7.store_array(&mut data[7 * stride]);

    v8.store_array(&mut data[8 * stride]);

    v9.store_array(&mut data[9 * stride]);

    v10.store_array(&mut data[10 * stride]);

    v11.store_array(&mut data[11 * stride]);

    v12.store_array(&mut data[12 * stride]);

    v13.store_array(&mut data[13 * stride]);

    v14.store_array(&mut data[14 * stride]);

    v15.store_array(&mut data[15 * stride]);

    v16.store_array(&mut data[16 * stride]);

    v17.store_array(&mut data[17 * stride]);

    v18.store_array(&mut data[18 * stride]);

    v19.store_array(&mut data[19 * stride]);

    v20.store_array(&mut data[20 * stride]);

    v21.store_array(&mut data[21 * stride]);

    v22.store_array(&mut data[22 * stride]);

    v23.store_array(&mut data[23 * stride]);

    v24.store_array(&mut data[24 * stride]);

    v25.store_array(&mut data[25 * stride]);

    v26.store_array(&mut data[26 * stride]);

    v27.store_array(&mut data[27 * stride]);

    v28.store_array(&mut data[28 * stride]);

    v29.store_array(&mut data[29 * stride]);

    v30.store_array(&mut data[30 * stride]);

    v31.store_array(&mut data[31 * stride]);

#[inline(always)]

pub(super) fn do_reinterpreting_dct_32_rowblock<D: SimdDescriptor>(

    d: D,

    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],

) {

    assert!(data.len() >= 32);

    const { assert!(32usize.is_multiple_of(D::F32Vec::LEN)) };

    let row_stride = 32 / D::F32Vec::LEN;

    let mut v0 = D::F32Vec::load_array(

d,

        &data[row_stride * (0 % D::F32Vec::LEN) + (0 / D::F32Vec::LEN)],

);

    let mut v1 = D::F32Vec::load_array(

d,

        &data[row_stride * (1 % D::F32Vec::LEN) + (1 / D::F32Vec::LEN)],

);

    let mut v2 = D::F32Vec::load_array(

d,

        &data[row_stride * (2 % D::F32Vec::LEN) + (2 / D::F32Vec::LEN)],

);

    let mut v3 = D::F32Vec::load_array(

d,

        &data[row_stride * (3 % D::F32Vec::LEN) + (3 / D::F32Vec::LEN)],

);

    let mut v4 = D::F32Vec::load_array(

d,

        &data[row_stride * (4 % D::F32Vec::LEN) + (4 / D::F32Vec::LEN)],

);

    let mut v5 = D::F32Vec::load_array(

d,

        &data[row_stride * (5 % D::F32Vec::LEN) + (5 / D::F32Vec::LEN)],

);

    let mut v6 = D::F32Vec::load_array(

d,

        &data[row_stride * (6 % D::F32Vec::LEN) + (6 / D::F32Vec::LEN)],

);

    let mut v7 = D::F32Vec::load_array(

d,

        &data[row_stride * (7 % D::F32Vec::LEN) + (7 / D::F32Vec::LEN)],

);

    let mut v8 = D::F32Vec::load_array(

d,

        &data[row_stride * (8 % D::F32Vec::LEN) + (8 / D::F32Vec::LEN)],

);

    let mut v9 = D::F32Vec::load_array(

d,

        &data[row_stride * (9 % D::F32Vec::LEN) + (9 / D::F32Vec::LEN)],

);

    let mut v10 = D::F32Vec::load_array(

d,

        &data[row_stride * (10 % D::F32Vec::LEN) + (10 / D::F32Vec::LEN)],

);

    let mut v11 = D::F32Vec::load_array(

d,

        &data[row_stride * (11 % D::F32Vec::LEN) + (11 / D::F32Vec::LEN)],

);

    let mut v12 = D::F32Vec::load_array(

d,

        &data[row_stride * (12 % D::F32Vec::LEN) + (12 / D::F32Vec::LEN)],

);

    let mut v13 = D::F32Vec::load_array(

d,

        &data[row_stride * (13 % D::F32Vec::LEN) + (13 / D::F32Vec::LEN)],

);

    let mut v14 = D::F32Vec::load_array(

d,

        &data[row_stride * (14 % D::F32Vec::LEN) + (14 / D::F32Vec::LEN)],

);

    let mut v15 = D::F32Vec::load_array(

d,

        &data[row_stride * (15 % D::F32Vec::LEN) + (15 / D::F32Vec::LEN)],

);

    let mut v16 = D::F32Vec::load_array(

d,

        &data[row_stride * (16 % D::F32Vec::LEN) + (16 / D::F32Vec::LEN)],

);

    let mut v17 = D::F32Vec::load_array(

d,

        &data[row_stride * (17 % D::F32Vec::LEN) + (17 / D::F32Vec::LEN)],

);

    let mut v18 = D::F32Vec::load_array(

d,

        &data[row_stride * (18 % D::F32Vec::LEN) + (18 / D::F32Vec::LEN)],

);

    let mut v19 = D::F32Vec::load_array(

d,

        &data[row_stride * (19 % D::F32Vec::LEN) + (19 / D::F32Vec::LEN)],

);

    let mut v20 = D::F32Vec::load_array(

d,

        &data[row_stride * (20 % D::F32Vec::LEN) + (20 / D::F32Vec::LEN)],

);

    let mut v21 = D::F32Vec::load_array(

d,

        &data[row_stride * (21 % D::F32Vec::LEN) + (21 / D::F32Vec::LEN)],

);

    let mut v22 = D::F32Vec::load_array(

d,

        &data[row_stride * (22 % D::F32Vec::LEN) + (22 / D::F32Vec::LEN)],

);

    let mut v23 = D::F32Vec::load_array(

d,

        &data[row_stride * (23 % D::F32Vec::LEN) + (23 / D::F32Vec::LEN)],

);

    let mut v24 = D::F32Vec::load_array(

d,

        &data[row_stride * (24 % D::F32Vec::LEN) + (24 / D::F32Vec::LEN)],

);

    let mut v25 = D::F32Vec::load_array(

d,

        &data[row_stride * (25 % D::F32Vec::LEN) + (25 / D::F32Vec::LEN)],

);

    let mut v26 = D::F32Vec::load_array(

d,

        &data[row_stride * (26 % D::F32Vec::LEN) + (26 / D::F32Vec::LEN)],

);

    let mut v27 = D::F32Vec::load_array(

d,

        &data[row_stride * (27 % D::F32Vec::LEN) + (27 / D::F32Vec::LEN)],

);

    let mut v28 = D::F32Vec::load_array(

d,

        &data[row_stride * (28 % D::F32Vec::LEN) + (28 / D::F32Vec::LEN)],

);

    let mut v29 = D::F32Vec::load_array(

d,

        &data[row_stride * (29 % D::F32Vec::LEN) + (29 / D::F32Vec::LEN)],

);

    let mut v30 = D::F32Vec::load_array(

d,

        &data[row_stride * (30 % D::F32Vec::LEN) + (30 / D::F32Vec::LEN)],

);

    let mut v31 = D::F32Vec::load_array(

d,

        &data[row_stride * (31 % D::F32Vec::LEN) + (31 / D::F32Vec::LEN)],

);

        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,

        v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,

    ) = reinterpreting_dct_32(

        d, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,

        v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,

);

    v0.store_array(&mut data[row_stride * (0 % D::F32Vec::LEN) + (0 / D::F32Vec::LEN)]);

    v1.store_array(&mut data[row_stride * (1 % D::F32Vec::LEN) + (1 / D::F32Vec::LEN)]);

    v2.store_array(&mut data[row_stride * (2 % D::F32Vec::LEN) + (2 / D::F32Vec::LEN)]);

    v3.store_array(&mut data[row_stride * (3 % D::F32Vec::LEN) + (3 / D::F32Vec::LEN)]);

    v4.store_array(&mut data[row_stride * (4 % D::F32Vec::LEN) + (4 / D::F32Vec::LEN)]);

    v5.store_array(&mut data[row_stride * (5 % D::F32Vec::LEN) + (5 / D::F32Vec::LEN)]);

    v6.store_array(&mut data[row_stride * (6 % D::F32Vec::LEN) + (6 / D::F32Vec::LEN)]);

    v7.store_array(&mut data[row_stride * (7 % D::F32Vec::LEN) + (7 / D::F32Vec::LEN)]);

    v8.store_array(&mut data[row_stride * (8 % D::F32Vec::LEN) + (8 / D::F32Vec::LEN)]);

    v9.store_array(&mut data[row_stride * (9 % D::F32Vec::LEN) + (9 / D::F32Vec::LEN)]);

    v10.store_array(&mut data[row_stride * (10 % D::F32Vec::LEN) + (10 / D::F32Vec::LEN)]);

    v11.store_array(&mut data[row_stride * (11 % D::F32Vec::LEN) + (11 / D::F32Vec::LEN)]);

    v12.store_array(&mut data[row_stride * (12 % D::F32Vec::LEN) + (12 / D::F32Vec::LEN)]);

    v13.store_array(&mut data[row_stride * (13 % D::F32Vec::LEN) + (13 / D::F32Vec::LEN)]);

    v14.store_array(&mut data[row_stride * (14 % D::F32Vec::LEN) + (14 / D::F32Vec::LEN)]);

    v15.store_array(&mut data[row_stride * (15 % D::F32Vec::LEN) + (15 / D::F32Vec::LEN)]);

    v16.store_array(&mut data[row_stride * (16 % D::F32Vec::LEN) + (16 / D::F32Vec::LEN)]);

    v17.store_array(&mut data[row_stride * (17 % D::F32Vec::LEN) + (17 / D::F32Vec::LEN)]);

    v18.store_array(&mut data[row_stride * (18 % D::F32Vec::LEN) + (18 / D::F32Vec::LEN)]);

    v19.store_array(&mut data[row_stride * (19 % D::F32Vec::LEN) + (19 / D::F32Vec::LEN)]);

    v20.store_array(&mut data[row_stride * (20 % D::F32Vec::LEN) + (20 / D::F32Vec::LEN)]);

    v21.store_array(&mut data[row_stride * (21 % D::F32Vec::LEN) + (21 / D::F32Vec::LEN)]);

    v22.store_array(&mut data[row_stride * (22 % D::F32Vec::LEN) + (22 / D::F32Vec::LEN)]);

    v23.store_array(&mut data[row_stride * (23 % D::F32Vec::LEN) + (23 / D::F32Vec::LEN)]);

    v24.store_array(&mut data[row_stride * (24 % D::F32Vec::LEN) + (24 / D::F32Vec::LEN)]);

    v25.store_array(&mut data[row_stride * (25 % D::F32Vec::LEN) + (25 / D::F32Vec::LEN)]);

    v26.store_array(&mut data[row_stride * (26 % D::F32Vec::LEN) + (26 / D::F32Vec::LEN)]);

    v27.store_array(&mut data[row_stride * (27 % D::F32Vec::LEN) + (27 / D::F32Vec::LEN)]);

    v28.store_array(&mut data[row_stride * (28 % D::F32Vec::LEN) + (28 / D::F32Vec::LEN)]);

    v29.store_array(&mut data[row_stride * (29 % D::F32Vec::LEN) + (29 / D::F32Vec::LEN)]);

    v30.store_array(&mut data[row_stride * (30 % D::F32Vec::LEN) + (30 / D::F32Vec::LEN)]);

    v31.store_array(&mut data[row_stride * (31 % D::F32Vec::LEN) + (31 / D::F32Vec::LEN)]);

#[inline(always)]

pub(super) fn do_reinterpreting_dct_32_trh<D: SimdDescriptor>(

    d: D,

    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],

) {

    let row_stride = 16 / D::F32Vec::LEN;

    assert!(data.len() > 31 * row_stride);

    const { assert!(16usize.is_multiple_of(D::F32Vec::LEN)) };

    let mut v0 = D::F32Vec::load_array(d, &data[row_stride * 0]);

    let mut v1 = D::F32Vec::load_array(d, &data[row_stride * 1]);

    let mut v2 = D::F32Vec::load_array(d, &data[row_stride * 2]);

    let mut v3 = D::F32Vec::load_array(d, &data[row_stride * 3]);

    let mut v4 = D::F32Vec::load_array(d, &data[row_stride * 4]);

    let mut v5 = D::F32Vec::load_array(d, &data[row_stride * 5]);

    let mut v6 = D::F32Vec::load_array(d, &data[row_stride * 6]);

    let mut v7 = D::F32Vec::load_array(d, &data[row_stride * 7]);

    let mut v8 = D::F32Vec::load_array(d, &data[row_stride * 8]);

    let mut v9 = D::F32Vec::load_array(d, &data[row_stride * 9]);

    let mut v10 = D::F32Vec::load_array(d, &data[row_stride * 10]);

    let mut v11 = D::F32Vec::load_array(d, &data[row_stride * 11]);

    let mut v12 = D::F32Vec::load_array(d, &data[row_stride * 12]);

    let mut v13 = D::F32Vec::load_array(d, &data[row_stride * 13]);

    let mut v14 = D::F32Vec::load_array(d, &data[row_stride * 14]);

    let mut v15 = D::F32Vec::load_array(d, &data[row_stride * 15]);

    let mut v16 = D::F32Vec::load_array(d, &data[row_stride * 16]);

    let mut v17 = D::F32Vec::load_array(d, &data[row_stride * 17]);

    let mut v18 = D::F32Vec::load_array(d, &data[row_stride * 18]);

    let mut v19 = D::F32Vec::load_array(d, &data[row_stride * 19]);

    let mut v20 = D::F32Vec::load_array(d, &data[row_stride * 20]);

    let mut v21 = D::F32Vec::load_array(d, &data[row_stride * 21]);

    let mut v22 = D::F32Vec::load_array(d, &data[row_stride * 22]);

    let mut v23 = D::F32Vec::load_array(d, &data[row_stride * 23]);

    let mut v24 = D::F32Vec::load_array(d, &data[row_stride * 24]);

    let mut v25 = D::F32Vec::load_array(d, &data[row_stride * 25]);

    let mut v26 = D::F32Vec::load_array(d, &data[row_stride * 26]);

    let mut v27 = D::F32Vec::load_array(d, &data[row_stride * 27]);

    let mut v28 = D::F32Vec::load_array(d, &data[row_stride * 28]);

    let mut v29 = D::F32Vec::load_array(d, &data[row_stride * 29]);

    let mut v30 = D::F32Vec::load_array(d, &data[row_stride * 30]);

    let mut v31 = D::F32Vec::load_array(d, &data[row_stride * 31]);

        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,

        v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,

    ) = reinterpreting_dct_32(

        d, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,

        v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,

);

    v0.store_array(&mut data[row_stride * 0]);

    v16.store_array(&mut data[row_stride * 1]);

    v1.store_array(&mut data[row_stride * 2]);

    v17.store_array(&mut data[row_stride * 3]);

    v2.store_array(&mut data[row_stride * 4]);

    v18.store_array(&mut data[row_stride * 5]);

    v3.store_array(&mut data[row_stride * 6]);

    v19.store_array(&mut data[row_stride * 7]);

    v4.store_array(&mut data[row_stride * 8]);

    v20.store_array(&mut data[row_stride * 9]);

    v5.store_array(&mut data[row_stride * 10]);

    v21.store_array(&mut data[row_stride * 11]);

    v6.store_array(&mut data[row_stride * 12]);

    v22.store_array(&mut data[row_stride * 13]);

    v7.store_array(&mut data[row_stride * 14]);

    v23.store_array(&mut data[row_stride * 15]);

    v8.store_array(&mut data[row_stride * 16]);

    v24.store_array(&mut data[row_stride * 17]);

    v9.store_array(&mut data[row_stride * 18]);

    v25.store_array(&mut data[row_stride * 19]);

    v10.store_array(&mut data[row_stride * 20]);

    v26.store_array(&mut data[row_stride * 21]);

    v11.store_array(&mut data[row_stride * 22]);

    v27.store_array(&mut data[row_stride * 23]);

    v12.store_array(&mut data[row_stride * 24]);

    v28.store_array(&mut data[row_stride * 25]);

    v13.store_array(&mut data[row_stride * 26]);

    v29.store_array(&mut data[row_stride * 27]);

    v14.store_array(&mut data[row_stride * 28]);

    v30.store_array(&mut data[row_stride * 29]);

    v15.store_array(&mut data[row_stride * 30]);

    v31.store_array(&mut data[row_stride * 31]);

Revision control

Copy as Markdown

Other Tools