#!/usr/bin/env python3

# gen_reinterpreting_dct2d.py

# Copyright (c) the JPEG XL Project Authors. All rights reserved.

# Use of this source code is governed by a BSD-style

# license that can be found in the LICENSE file.

# License banner that is printed verbatim at the very top of the generated
# output, written as `//` line comments.
# NOTE(review): the blank lines inside this literal are part of the string and
# are emitted as-is; confirm that the double spacing is intended.
HEADER = """\

// Copyright (c) the JPEG XL Project Authors. All rights reserved.

//

// Use of this source code is governed by a BSD-style

// license that can be found in the LICENSE file.

"""

# (ROWS, COLS) shapes for which code is generated. Every entry produces one
# `reinterpreting_dct2d_<ROWS>_<COLS>_impl` function and one public wrapper.
SIZES = [
    (1, 2),
    (2, 1),
    (1, 4),
    (4, 1),
    (2, 2),
    (2, 4),
    (4, 2),
    (4, 4),
    (4, 8),
    (8, 4),
    (8, 8),
    (8, 16),
    (16, 8),
    (16, 16),
    (16, 32),
    (32, 16),
    (32, 32),
]

# Preamble of the generated output: the license banner followed by the two
# `use` lines, each terminated by a newline.
print(
    HEADER,
    "use jxl_simd::{SimdDescriptor, F32SimdVec};",
    "use crate::*;",
    sep="\n",
)

def impl_r_less_c(ROWS, COLS):
    """Print the Rust statements of the transform body for a ROWS x COLS block.

    The generator uses this emitter for shapes with ROWS < COLS. Nothing is
    returned; every line of generated Rust goes to stdout via ``print``.

    Args:
        ROWS: Row count, substituted into the emitted code with ``%d``.
        COLS: Column count, substituted into the emitted code with ``%d``.
    """
    print("let data = D::F32Vec::make_array_slice_mut(data);")
    print("let column_chunks = %d / D::F32Vec::LEN;" % COLS)
    print("let row_chunks = %d / D::F32Vec::LEN;" % ROWS)

    # Step 1: do rowblock-DCTs on the first K rows, transposing KxK blocks first.
    print("for i in 0..row_chunks {")
    print("for j in 0..column_chunks {")
    print(
        "D::F32Vec::transpose_square(d, &mut data[i * %d + j..], column_chunks);" % COLS
    )
    print("}")
    print("do_reinterpreting_dct_%d_rowblock(d, &mut data[i * %d..]);" % (COLS, COLS))
    print("}")

    # Step 2: do column-DCTs on groups of K columns, transposing KxK blocks back.
    print("for i in 0..column_chunks {")
    print("for j in 0..row_chunks {")
    print(
        "D::F32Vec::transpose_square(d, &mut data[j * %d + i..], column_chunks);" % COLS
    )
    print("}")
    # The emitted call deliberately keeps its original form (no trailing `;`)
    # so the generated output stays byte-identical.
    print("do_reinterpreting_dct_%d(d, &mut data[i..], column_chunks)" % ROWS)
    print("}")

def impl_square(N):
    """Print the Rust statements of the transform body for an N x N block.

    The emitted lines are kept in a template and written to stdout one by
    one; ``N`` is substituted wherever the template contains ``%(n)d``.

    Args:
        N: Side length of the square block.
    """
    substitutions = {"n": N}
    template = (
        "let data = D::F32Vec::make_array_slice_mut(data);",
        "let chunks = %(n)d / D::F32Vec::LEN;",
        # Step 1: do column-DCTs on the first K columns.
        "for i in 0..chunks {",
        "do_reinterpreting_dct_%(n)d(d, &mut data[i..], chunks);",
        "}",
        # Step 2: do column-DCTs on groups of K columns, transposing KxK
        # blocks and swapping them in their final place as we do so.
        "for i in 0..chunks {",
        "D::F32Vec::transpose_square(d, &mut data[i * %(n)d + i..], chunks);",
        "for j in i+1..chunks {",
        "D::F32Vec::transpose_square(d, &mut data[j * %(n)d + i..], chunks);",
        "D::F32Vec::transpose_square(d, &mut data[i * %(n)d + j..], chunks);",
        "for k in 0..D::F32Vec::LEN {",
        "data.swap(i * %(n)d + j + k * chunks, j * %(n)d + i + k * chunks);",
        "}",
        "}",
        "do_reinterpreting_dct_%(n)d(d, &mut data[i..], chunks);",
        "}",
    )
    for rust_line in template:
        print(rust_line % substitutions)

def impl_r_greater_c(ROWS, COLS):
    """Print the Rust statements of the transform body for a ROWS x COLS block.

    The generator uses this emitter for shapes with ROWS > COLS. Nothing is
    returned; every line of generated Rust goes to stdout via ``print``.

    Args:
        ROWS: Row count, substituted into the emitted code with ``%d``.
        COLS: Column count, substituted into the emitted code with ``%d``.
    """
    # Number of square sub-blocks processed by the outer `l` loop. Integer
    # division keeps this an int for the `%d` substitution below.
    ratio = ROWS // COLS

    print("let data = D::F32Vec::make_array_slice_mut(data);")
    print("let column_chunks = %d / D::F32Vec::LEN;" % COLS)
    print("let row_chunks = %d / D::F32Vec::LEN;" % ROWS)

    # Step 1: do column-DCTs on columns, which does the first part of the full matrix transpose.
    print("for i in 0..column_chunks {")
    print("do_reinterpreting_dct_%d_trh(d, &mut data[i..]);" % ROWS)
    print("}")

    # Step 2: Incrementally transpose each square sub-block of the matrix, then do a column-DCT.
    print("for l in 0..%d {" % ratio)
    print("for i in 0..column_chunks {")
    # The emitted closure header and its body are printed as two lines; the
    # second print closes the closure with `};`.
    print(
        "let tr_block = |data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray], i, j, l| {"
    )
    print(
        "D::F32Vec::transpose_square(d, &mut data[i * %d + j + l * column_chunks..], row_chunks)};"
        % ROWS
    )
    print("tr_block(data, i, i, l);")
    print("for j in i+1..column_chunks {")
    print("tr_block(data, i, j, l);")
    print("tr_block(data, j, i, l);")
    print("for k in 0..D::F32Vec::LEN {")
    print(
        "data.swap(i * %d + j + k * row_chunks + l * column_chunks, j * %d + i + k * row_chunks + l * column_chunks);"
        % (ROWS, ROWS)
    )
    print("}")
    print("}")
    print(
        "do_reinterpreting_dct_%d(d, &mut data[i + l * column_chunks..], row_chunks);"
        % COLS
    )
    print("}")
    print("}")

# Emit one `reinterpreting_dct2d_<ROWS>_<COLS>_impl` function per shape in SIZES.
for ROWS, COLS in SIZES:
    print()
    SZ = ROWS * COLS
    # L is the smaller and S the larger of the two dimensions; the copy loop
    # emitted at the end iterates `y` over L and `x` over S.
    L = min(ROWS, COLS)
    S = max(ROWS, COLS)

    print("#[inline(always)]")
    print(
        "fn reinterpreting_dct2d_%d_%d_impl<D: SimdDescriptor>(d: D, data: &mut[f32], output: &mut[f32]) {"
        % (ROWS, COLS)
    )
    print('assert_eq!(data.len(), %d, "Data length mismatch");' % SZ)
    # The bound is the largest index the copy loop below writes:
    # y = L - 1, x = S - 1 with an output row stride of S * 8.
    print("assert!(output.len() > %d);" % ((L - 1) * S * 8 + S - 1))
    print("const { assert!(%dusize.is_multiple_of(D::F32Vec::LEN)) };" % ROWS)
    print("const { assert!(%dusize.is_multiple_of(D::F32Vec::LEN)) };" % COLS)

    # The transform body is wrapped in its own `{ ... }` scope, so the
    # `let data = ...` rebinding it emits ends before the copy loop.
    print("{")
    if ROWS == 1 or COLS == 1:
        # Single row or column: one DCT of length S over the whole buffer.
        print("let data = D::F32Vec::make_array_slice_mut(data);")
        print("do_reinterpreting_dct_%d(d, data, 1);" % S)
    elif ROWS < COLS:
        impl_r_less_c(ROWS, COLS)
    elif ROWS == COLS:
        impl_square(ROWS)
    else:
        impl_r_greater_c(ROWS, COLS)
    print("}")

    # Copy the L x S result into `output`, whose rows are S * 8 elements apart.
    print("for y in 0..%d {" % L)
    print("for x in 0..%d {" % S)
    print("output[y * %d + x] = data[y * %d + x];" % (S * 8, S))
    print("}")
    print("}")
    print("}")

# Wrappers to reduce SIMD size.
# Emit one public `reinterpreting_dct2d_<ROWS>_<COLS>` function per shape in
# SIZES. Each wrapper rebinds `d` according to the smaller dimension and then
# forwards to the matching `_impl` function.
for ROWS, COLS in SIZES:
    print()
    print("#[inline(always)]")
    print("#[allow(unused_variables)]")
    print(
        "pub fn reinterpreting_dct2d_%d_%d<D: SimdDescriptor>(d: D, data: &mut[f32], output: &mut[f32]) {"
        % (ROWS, COLS)
    )
    if ROWS < 4 or COLS < 4:
        # A dimension below 4: use the scalar descriptor.
        print("let d = jxl_simd::ScalarDescriptor::new().unwrap();")
    elif ROWS < 8 or COLS < 8:
        # A dimension below 8: ask for a 128-bit descriptor.
        print("let d = d.maybe_downgrade_128bit();")
    elif ROWS < 16 or COLS < 16:
        # A dimension below 16: ask for a 256-bit descriptor.
        print("let d = d.maybe_downgrade_256bit();")
    # Otherwise both dimensions are at least 16 and the caller's descriptor
    # is passed through unchanged.
    print("reinterpreting_dct2d_%d_%d_impl(d, data, output)" % (ROWS, COLS))
    print("}")