// Source: comm-central/third_party/rust/jxl_transforms/src/idct_large.rs (viewed via mozsearch)

// Copyright (c) the JPEG XL Project Authors. All rights reserved.

//

// Use of this source code is governed by a BSD-style

// license that can be found in the LICENSE file.

// This file contains a generic implementation of large (>32x32) 2d IDCTs.

// They are not implemented in the same way as smaller 2d IDCTs to reduce code size.

#![allow(clippy::excessive_precision)]

use std::f32::consts::SQRT_2;

use jxl_simd::F32SimdVec;

use jxl_simd::SimdDescriptor;

use crate::idct_32;

/// Recombination multipliers used by the 64-point IDCT stage.
///
/// Entry `i` equals `1 / (2 * cos((i + 0.5) * PI / 64))`, written out with more
/// digits than `f32` can hold (hence the crate-level `excessive_precision` allow).
#[rustfmt::skip]
const WC_WEIGHTS_64: [f32; 32] = [
    0.500150636020651, 0.5013584524464084, 0.5037887256810443, 0.5074711720725553,
    0.5124514794082247, 0.5187927131053328, 0.52657731515427, 0.535909816907992,
    0.5469204379855088, 0.5597698129470802, 0.57465518403266, 0.5918185358574165,
    0.6115573478825099, 0.6342389366884031, 0.6603198078137061, 0.6903721282002123,
    0.7251205223771985, 0.7654941649730891, 0.8127020908144905, 0.8683447152233481,
    0.9345835970364075, 1.0144082649970547, 1.1120716205797176, 1.233832737976571,
    1.3892939586328277, 1.5939722833856311, 1.8746759800084078, 2.282050068005162,
    2.924628428158216, 4.084611078129248, 6.796750711673633, 20.373878167231453,
];

/// Recombination multipliers used by the 128-point IDCT stage.
///
/// Entry `i` equals `1 / (2 * cos((i + 0.5) * PI / 128))`, written out with more
/// digits than `f32` can hold (hence the crate-level `excessive_precision` allow).
#[rustfmt::skip]
const WC_WEIGHTS_128: [f32; 64] = [
    0.5000376519155477, 0.5003390374428216, 0.5009427176380873, 0.5018505174842379,
    0.5030651913013697, 0.5045904432216454, 0.5064309549285542, 0.5085924210498143,
    0.5110815927066812, 0.5139063298475396, 0.5170756631334912, 0.5205998663018917,
    0.524490540114724, 0.5287607092074876, 0.5334249333971333, 0.538499435291984,
    0.5440022463817783, 0.549953374183236, 0.5563749934898856, 0.5632916653417023,
    0.5707305880121454, 0.5787218851348208, 0.5872989370937893, 0.5964987630244563,
    0.606362462272146, 0.6169357260050706, 0.6282694319707711, 0.6404203382416639,
    0.6534518953751283, 0.6674352009263413, 0.6824501259764195, 0.6985866506472291,
    0.7159464549705746, 0.7346448236478627, 0.7548129391165311, 0.776600658233963,
    0.8001798956216941, 0.8257487738627852, 0.8535367510066064, 0.8838110045596234,
    0.9168844461846523, 0.9531258743921193, 0.9929729612675466, 1.036949040910389,
    1.0856850642580145, 1.1399486751015042, 1.2006832557294167, 1.2690611716991191,
    1.346557628206286, 1.4350550884414341, 1.5369941008524954, 1.6555965242641195,
    1.7952052190778898, 1.961817848571166, 2.163957818751979, 2.4141600002500763,
    2.7316450287739396, 3.147462191781909, 3.7152427383269746, 4.5362909369693565,
    5.827688377844654, 8.153848602466814, 13.58429025728446, 40.744688103351834,
];

/// Recombination multipliers used by the 256-point IDCT stage.
///
/// Entry `i` equals `1 / (2 * cos((i + 0.5) * PI / 256))`, written out with more
/// digits than `f32` can hold (hence the crate-level `excessive_precision` allow).
#[rustfmt::skip]
const WC_WEIGHTS_256: [f32; 128] = [
    0.5000094125358878, 0.500084723455784, 0.5002354020255269, 0.5004615618093246,
    0.5007633734146156, 0.5011410648064231, 0.5015949217281668, 0.502125288230386,
    0.5027325673091954, 0.5034172216566842, 0.5041797745258774, 0.5050208107132756,
    0.5059409776624396, 0.5069409866925212, 0.5080216143561264, 0.509183703931388,
    0.5104281670536573, 0.5117559854927805, 0.5131682130825206, 0.5146659778093218,
    0.516250484068288, 0.5179230150949777, 0.5196849355823947, 0.5215376944933958,
    0.5234828280796439, 0.52552196311921, 0.5276568203859896, 0.5298892183652453,
    0.5322210772308335, 0.5346544231010253, 0.537191392591309, 0.5398342376841637,
    0.5425853309375497, 0.545447171055775, 0.5484223888484947, 0.551513753605893,
    0.554724179920619, 0.5580567349898085, 0.5615146464335654, 0.5651013106696203,
    0.5688203018875696, 0.5726753816701664, 0.5766705093136241, 0.5808098529038624,
    0.5850978012111273, 0.58953897647151, 0.5941382481306648, 0.5989007476325463,
    0.6038318843443582, 0.6089373627182432, 0.614223200800649, 0.6196957502119484,
    0.6253617177319102, 0.6312281886412079, 0.6373026519855411, 0.6435930279473415,
    0.6501076975307724, 0.6568555347890955, 0.6638459418498757, 0.6710888870233562,
    0.6785949463131795, 0.6863753486870501, 0.6944420255086364, 0.7028076645818034,
    0.7114857693151208, 0.7204907235796304, 0.7298378629074134, 0.7395435527641373,
    0.749625274727372, 0.7601017215162176, 0.7709929019493761, 0.7823202570613161,
    0.7941067887834509, 0.8063772028037925, 0.8191580674598145, 0.83247799080191,
    0.8463678182968619, 0.860860854031955, 0.8759931087426972, 0.8918035785352535,
    0.9083345588266809, 0.9256319988042384, 0.9437459026371479, 0.962730784794803,
    0.9826461881778968, 1.0035572754078206, 1.0255355056139732, 1.048659411496106,
    1.0730154944316674, 1.0986992590905857, 1.1258164135986009, 1.1544842669978943,
    1.184833362908442, 1.217009397314603, 1.2511754798461228, 1.287514812536712,
    1.326233878832723, 1.3675662599582539, 1.411777227500661, 1.459169302866857,
    1.5100890297227016, 1.5649352798258847, 1.6241695131835794, 1.6883285509131505,
    1.7580406092704062, 1.8340456094306077, 1.9172211551275689, 2.0086161135167564,
    2.1094945286246385, 2.22139377701127, 2.346202662531156, 2.486267909203593,
    2.644541877144861, 2.824791402350551, 3.0318994541759925, 3.2723115884254845,
    3.5547153325075804, 3.891107790700307, 4.298537526449054, 4.802076008665048,
    5.440166215091329, 6.274908408039339, 7.413566756422303, 9.058751453879703,
    11.644627325175037, 16.300023088031555, 27.163977662448232, 81.48784219222516,
];

#[inline(always)]

fn idct_impl_inner<D: SimdDescriptor>(d: D, data: &mut [D::F32Vec], scratch: &mut [D::F32Vec]) {

    let n = data.len();

    assert!(scratch.len() >= n);

    if n == 32 {

        d.call(

            #[inline(always)]

            |_| {

                    data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],

                    data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],

                    data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],

                    data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],

                ) = idct_32(

                    d, data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],

                    data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],

                    data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],

                    data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],

},

);

        return;

    let wc_weights = match n {

        64 => &WC_WEIGHTS_64[..],

        128 => &WC_WEIGHTS_128[..],

        256 => &WC_WEIGHTS_256[..],

        _ => unreachable!("invalid large-dct size: {n}"),

};

    assert_eq!(wc_weights.len(), n / 2);

    let (first_half, second_half) = scratch[..n].split_at_mut(n / 2);

    for i in 0..n / 2 {

        first_half[i] = data[i * 2];

        second_half[i] = data[2 * i + 1];

    d.call(

        #[inline(always)]

        |_| idct_impl_inner(d, first_half, data),

);

    for i in (1..n / 2).rev() {

        second_half[i] += second_half[i - 1];

    second_half[0] *= D::F32Vec::splat(d, SQRT_2);

    d.call(

        #[inline(always)]

        |_| idct_impl_inner(d, second_half, data),

);

    for i in 0..n / 2 {

        let mul = D::F32Vec::splat(d, wc_weights[i]);

        data[i] = second_half[i].mul_add(mul, first_half[i]);

        data[n - i - 1] = second_half[i].neg_mul_add(mul, first_half[i]);

#[inline(always)]

fn do_idct<D: SimdDescriptor>(

    d: D,

    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],

    stride: usize,

    storage: &mut [D::F32Vec],

    scratch: &mut [D::F32Vec],

) {

    let n = storage.len();

    assert!((n - 1) * stride < data.len());

    for i in 0..n {

        storage[i] = D::F32Vec::load_array(d, &data[i * stride]);

    d.call(

        #[inline(always)]

        |d| idct_impl_inner(d, storage, scratch),

);

    for i in 0..n {

        storage[i].store_array(&mut data[i * stride]);

#[inline(always)]

fn do_idct_rowblock<D: SimdDescriptor>(

    d: D,

    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],

    storage: &mut [D::F32Vec],

    scratch: &mut [D::F32Vec],

) {

    let n = storage.len();

    assert!(n.is_multiple_of(D::F32Vec::LEN));

    assert!(data.len() >= n);

    let row_stride = n / D::F32Vec::LEN;

    for i in 0..n {

        storage[i] = D::F32Vec::load_array(

d,

            &data[row_stride * (i % D::F32Vec::LEN) + (i / D::F32Vec::LEN)],

);

    d.call(

        #[inline(always)]

        |d| idct_impl_inner(d, storage, scratch),

);

    for i in 0..n {

        storage[i].store_array(&mut data[row_stride * (i % D::F32Vec::LEN) + (i / D::F32Vec::LEN)]);

#[inline(always)]

fn do_idct_trh<D: SimdDescriptor>(

    d: D,

    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],

    storage: &mut [D::F32Vec],

    scratch: &mut [D::F32Vec],

) {

    let n = storage.len();

    assert!(n.is_multiple_of(D::F32Vec::LEN));

    assert!(data.len() >= n);

    let row_stride = n / (2 * D::F32Vec::LEN);

    for i in 0..n / 2 {

        storage[i] = D::F32Vec::load_array(d, &data[row_stride * 2 * i]);

        storage[i + n / 2] = D::F32Vec::load_array(d, &data[row_stride * (2 * i + 1)]);

    d.call(

        #[inline(always)]

        |d| idct_impl_inner(d, storage, scratch),

);

    for i in 0..n {

        storage[i].store_array(&mut data[row_stride * i]);

#[inline(always)]

fn idct2d_square<D: SimdDescriptor>(

    d: D,

    data: &mut [f32],

    n: usize,

    storage: &mut [D::F32Vec],

    scratch: &mut [D::F32Vec],

) {

    let data = D::F32Vec::make_array_slice_mut(data);

    let chunks = n / D::F32Vec::LEN;

    // Step 1: do column-DCTs on the first K columns.

    for i in 0..chunks {

        d.call(

            #[inline(always)]

            |_| do_idct(d, &mut data[i..], chunks, &mut storage[..n], scratch),

);

    // Step 2: do column-DCTs on groups of K columns, transposing KxK blocks and

    // swapping them in their final place as we do so.

    for i in 0..chunks {

        D::F32Vec::transpose_square(d, &mut data[i * n + i..], chunks);

        for j in i + 1..chunks {

            D::F32Vec::transpose_square(d, &mut data[j * n + i..], chunks);

            D::F32Vec::transpose_square(d, &mut data[i * n + j..], chunks);

            for k in 0..D::F32Vec::LEN {

                data.swap(i * n + j + k * chunks, j * n + i + k * chunks);

        d.call(

            #[inline(always)]

            |_| do_idct(d, &mut data[i..], chunks, &mut storage[..n], scratch),

);

#[inline(always)]

fn idct2d_wide<D: SimdDescriptor>(

    d: D,

    data: &mut [f32],

    c: usize,

    r: usize,

    storage: &mut [D::F32Vec],

    scratch: &mut [D::F32Vec],

) {

    assert!(r < c);

    let data = D::F32Vec::make_array_slice_mut(data);

    let column_chunks = c / D::F32Vec::LEN;

    let row_chunks = r / D::F32Vec::LEN;

    // Step 1: do rowblock-DCTs on the first K rows, transposing KxK blocks first.

    for i in 0..row_chunks {

        for j in 0..column_chunks {

            D::F32Vec::transpose_square(d, &mut data[i * c + j..], column_chunks);

        d.call(

            #[inline(always)]

            |_| do_idct_rowblock(d, &mut data[i * c..], &mut storage[..c], scratch),

);

    // Step 2: do column-DCTs on groups of K columns, transposing KxK blocks back.

    for i in 0..column_chunks {

        for j in 0..row_chunks {

            D::F32Vec::transpose_square(d, &mut data[j * c + i..], column_chunks);

        d.call(

            #[inline(always)]

            |_| do_idct(d, &mut data[i..], column_chunks, &mut storage[..r], scratch),

);

#[inline(always)]

fn idct2d_thin<D: SimdDescriptor>(

    d: D,

    data: &mut [f32],

    c: usize,

    r: usize,

    storage: &mut [D::F32Vec],

    scratch: &mut [D::F32Vec],

) {

    assert!(r > c);

    let data = D::F32Vec::make_array_slice_mut(data);

    let column_chunks = c / D::F32Vec::LEN;

    let row_chunks = r / D::F32Vec::LEN;

    // Note: input is transposed, so in the beginning it has ROWS *columns* and COLS *rows*.

    // Step 1: do column-DCTs on columns.

    for i in 0..row_chunks {

        d.call(

            #[inline(always)]

            |_| do_idct(d, &mut data[i..], row_chunks, &mut storage[..c], scratch),

);

    // Step 2: Incrementally transpose each square sub-block of the matrix, then do a column-IDCT which also completes the transpose.

    for i in 0..column_chunks {

        let tr_block = |data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray], i, j, l| {

            D::F32Vec::transpose_square(d, &mut data[i * r + j + l * column_chunks..], row_chunks)

};

        (0..2).for_each(|l| tr_block(data, i, i, l));

        for j in i + 1..column_chunks {

            (0..2).for_each(|l| tr_block(data, i, j, l));

            (0..2).for_each(|l| tr_block(data, j, i, l));

            for l in 0..2 {

                for k in 0..D::F32Vec::LEN {

                    data.swap(

                        i * r + j + k * row_chunks + l * column_chunks,

                        j * r + i + k * row_chunks + l * column_chunks,

);

        d.call(

            #[inline(always)]

            |_| do_idct_trh(d, &mut data[i..], &mut storage[..r], scratch),

);

macro_rules! make_idct2d {

    ($name: ident, $h: literal, $w: literal) => {

        pub fn $name<D: SimdDescriptor>(d: D, data: &mut [f32]) {

            const L: usize = if $w < $h { $h } else { $w };

            let mut storage = [D::F32Vec::zero(d); L];

            let mut scratch = [D::F32Vec::zero(d); L];

            if $w == $h {

                return d.call(

                    #[inline(always)]

                    |_| idct2d_square(d, data, $w, &mut storage, &mut scratch),

);

            if $w > $h {

                return d.call(

                    #[inline(always)]

                    |_| idct2d_wide(d, data, $w, $h, &mut storage, &mut scratch),

);

            return d.call(

                #[inline(always)]

                |_| idct2d_thin(d, data, $w, $h, &mut storage, &mut scratch),

);

};

make_idct2d!(idct2d_32_64, 32, 64);

make_idct2d!(idct2d_64_32, 64, 32);

make_idct2d!(idct2d_64_64, 64, 64);

make_idct2d!(idct2d_64_128, 64, 128);

make_idct2d!(idct2d_128_64, 128, 64);

make_idct2d!(idct2d_128_128, 128, 128);

make_idct2d!(idct2d_128_256, 128, 256);

make_idct2d!(idct2d_256_128, 256, 128);

make_idct2d!(idct2d_256_256, 256, 256);

/// Test-only wrapper: runs a strided 64-point 1-D IDCT over `data` via `do_idct`,
/// supplying freshly zeroed `storage` and `scratch` buffers of 64 vectors each.
#[cfg(test)]
#[inline(always)]
pub fn do_idct_64<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
) {
    let mut storage = [D::F32Vec::zero(d); 64];
    let mut scratch = [D::F32Vec::zero(d); 64];
    d.call(
        #[inline(always)]
        |_| {
            do_idct(d, data, stride, &mut storage, &mut scratch);
        },
    );
}

/// Test-only wrapper: runs a strided 128-point 1-D IDCT over `data` via `do_idct`,
/// supplying freshly zeroed `storage` and `scratch` buffers of 128 vectors each.
#[cfg(test)]
#[inline(always)]
pub fn do_idct_128<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
) {
    let mut storage = [D::F32Vec::zero(d); 128];
    let mut scratch = [D::F32Vec::zero(d); 128];
    d.call(
        #[inline(always)]
        |_| {
            do_idct(d, data, stride, &mut storage, &mut scratch);
        },
    );
}

/// Test-only wrapper: runs a strided 256-point 1-D IDCT over `data` via `do_idct`,
/// supplying freshly zeroed `storage` and `scratch` buffers of 256 vectors each.
#[cfg(test)]
#[inline(always)]
pub fn do_idct_256<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
) {
    let mut storage = [D::F32Vec::zero(d); 256];
    let mut scratch = [D::F32Vec::zero(d); 256];
    d.call(
        #[inline(always)]
        |_| {
            do_idct(d, data, stride, &mut storage, &mut scratch);
        },
    );
}