Merge remote-tracking branch 'remotes/kvz_github/master' into Fix-monochrome

# Conflicts:
#	.gitlab-ci.yml
#	build/kvazaar_lib/kvazaar_lib.vcxproj.filters
#	src/cfg.c
#	src/encoder.h
#	src/kvazaar.h
#	src/rdo.c
This commit is contained in:
Joose Sainio 2021-04-23 10:56:50 +03:00
commit 1aaa95601c
46 changed files with 2174 additions and 1235 deletions

View file

@ -33,7 +33,7 @@ test-asan:
# variables:
# CFLAGS: '-fsanitize=thread'
# # Temporarily suppress known errors or false positives.
# TSAN_OPTIONS: 'suppressions=/builds/TIE/ultravideo/kvazaar/tests/tsan_suppressions.txt'
# TSAN_OPTIONS: 'suppressions=/builds/cs/ultravideo/kvazaar/tests/tsan_suppressions.txt'
test-ubsan:
<<: *test-template

View file

@ -117,6 +117,7 @@ Options:
bits, lambda, distortion, and qp for each ctu.
These are meant for debugging and are not
written unless the prefix is defined.
Video structure:
-q, --qp <integer> : Quantization parameter [22]
-p, --period <integer> : Period of intra pictures [64]
@ -253,6 +254,16 @@ Compression tools:
- sensitive: Terminate even earlier.
--fast-residual-cost <int> : Skip CABAC cost for residual coefficients
when QP is below the limit. [0]
--fast-coeff-table <string> : Read custom weights for residual
coefficients from a file instead of using
defaults [default]
--fastrd-sampling : Enable learning data sampling for fast coefficient
table generation
--fastrd-accuracy-check : Evaluate the accuracy of fast coefficient
prediction
--fastrd-outdir : Directory to which to output sampled data or accuracy
data, into <fastrd-outdir>/0.txt to 50.txt, one file
for each QP that blocks were estimated on
--(no-)intra-rdo-et : Check intra modes in rdo stage only until
a zero coefficient CU is found. [disabled]
--(no-)early-skip : Try to find skip cu from merge candidates.

View file

@ -1,8 +1,3 @@
# Only the whitelisted branches get built, regardless of build config
branches:
only:
- master
# Email the author if their commit either failed to build or fixed a failed build
# good -> bad, bad -> bad, bad -> good but not good -> good
notifications:
@ -37,13 +32,16 @@ configuration:
- Release
# Build with multiple compilers / build suites
image: Visual Studio 2015
environment:
matrix:
- platform: Win32
- platform: x64
- MSYSTEM: MINGW32
- MSYSTEM: MINGW64
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
platform: Win32
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
platform: x64
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
MSYSTEM: MINGW32
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
MSYSTEM: MINGW64
for:
-

View file

@ -164,6 +164,7 @@
<ClCompile Include="..\..\src\nal.c" />
<ClCompile Include="..\..\src\rate_control.c" />
<ClCompile Include="..\..\src\rdo.c" />
<ClCompile Include="..\..\src\fast_coeff_cost.c" />
<ClCompile Include="..\..\src\sao.c" />
<ClCompile Include="..\..\src\scalinglist.c" />
<ClCompile Include="..\..\src\search.c" />
@ -290,6 +291,7 @@
<ClInclude Include="..\..\src\nal.h" />
<ClInclude Include="..\..\src\rate_control.h" />
<ClInclude Include="..\..\src\rdo.h" />
<ClInclude Include="..\..\src\fast_coeff_cost.h" />
<ClInclude Include="..\..\src\sao.h" />
<ClInclude Include="..\..\src\scalinglist.h" />
<ClInclude Include="..\..\src\search.h" />

View file

@ -174,6 +174,12 @@
<ClCompile Include="..\..\src\rdo.c">
<Filter>Compression</Filter>
</ClCompile>
<ClCompile Include="..\..\src\fast_coeff_cost.c">
<Filter>Compression</Filter>
</ClCompile>
<ClCompile Include="..\..\src\inter.c">
<Filter>Reconstruction</Filter>
</ClCompile>
<ClCompile Include="..\..\src\intra.c">
<Filter>Reconstruction</Filter>
</ClCompile>
@ -342,6 +348,9 @@
<ClInclude Include="..\..\src\rdo.h">
<Filter>Compression</Filter>
</ClInclude>
<ClInclude Include="..\..\src\fast_coeff_cost.h">
<Filter>Compression</Filter>
</ClInclude>
<ClInclude Include="..\..\src\strategies\strategies-common.h">
<Filter>Optimization\strategies</Filter>
</ClInclude>

View file

@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
#
# Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
ver_major=6
ver_minor=3
ver_minor=5
ver_release=0
# Prevents configure from adding a lot of defines to the CFLAGS

View file

@ -1,4 +1,4 @@
.TH KVAZAAR "1" "September 2020" "kvazaar v2.0.0" "User Commands"
.TH KVAZAAR "1" "January 2021" "kvazaar v2.0.0" "User Commands"
.SH NAME
kvazaar \- open source HEVC encoder
.SH SYNOPSIS
@ -106,6 +106,7 @@ A prefix used for stats files that include
bits, lambda, distortion, and qp for each ctu.
These are meant for debugging and are not
written unless the prefix is defined.
.SS "Video structure:"
.TP
\fB\-q\fR, \fB\-\-qp <integer>
@ -326,6 +327,24 @@ Motion estimation termination [on]
Skip CABAC cost for residual coefficients
when QP is below the limit. [0]
.TP
\fB\-\-fast\-coeff\-table <string>
Read custom weights for residual
coefficients from a file instead of using
defaults [default]
.TP
\fB\-\-fastrd\-sampling
Enable learning data sampling for fast coefficient
table generation
.TP
\fB\-\-fastrd\-accuracy\-check
Evaluate the accuracy of fast coefficient
prediction
.TP
\fB\-\-fastrd\-outdir
Directory to which to output sampled data or accuracy
data, into <fastrd\-outdir>/0.txt to 50.txt, one file
for each QP that blocks were estimated on
.TP
\fB\-\-(no\-)intra\-rdo\-et
Check intra modes in rdo stage only until
a zero coefficient CU is found. [disabled]

14
examples/README.md Normal file
View file

@ -0,0 +1,14 @@
Examples
========
Examples of external files for use with Kvazaar.
## Region of interest (roi) files
A simple text file can be used with the `--roi` switch to setup regions of interest for encoding.
The header row of the file tells how many regions the encoded frames are divided into (columns, rows).
The header must be followed by a data row with number entries equal to columns * rows.
The data row will tell the encoder which delta QP value will be assigned to each region.
The included example file will split frames into four regions with the top regions having a delta QP of +5
```
2 2
5 5 0 0
```

View file

@ -0,0 +1,51 @@
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.046152 4.874163 3.830968 6.617950
0.040648 4.920004 3.922710 6.572261
0.033854 4.982197 4.021474 6.518219
0.027073 5.056451 4.082557 6.471514
0.021064 5.125763 4.113825 6.436425
0.016605 5.170554 4.119091 6.423091
0.012953 5.196849 4.128659 6.422746
0.010218 5.194947 4.166336 6.431305
0.007970 5.177114 4.217242 6.429468
0.006442 5.138598 4.275070 6.396064
0.005184 5.093265 4.337876 6.352651
0.004134 5.046189 4.413434 6.310742
0.003239 5.001028 4.492965 6.264692
0.002689 4.959881 4.569652 6.198468
0.002280 4.920991 4.642861 6.123074
0.001940 4.886799 4.709124 6.049688
0.001631 4.858057 4.767754 5.986929
0.001409 4.839546 4.813134 5.951025
0.001223 4.823649 4.856675 5.933274
0.001055 4.806288 4.904500 5.940060
0.000899 4.789201 4.950018 5.955955
0.000781 4.776673 4.981798 5.982144
0.000683 4.766721 5.006732 6.019175
0.000603 4.757364 5.030649 6.081959
0.000529 4.746016 5.059187 6.158720
0.000460 4.729670 5.100437 6.254217
0.000397 4.711187 5.150631 6.364452
0.000345 4.692304 5.213098 6.506122
0.000300 4.674471 5.279962 6.667672
0.000264 4.660182 5.342776 6.836979
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093
0.000237 4.649543 5.392507 6.977093

2
examples/roi.txt Normal file
View file

@ -0,0 +1,2 @@
2 2
5 5 0 0

View file

@ -0,0 +1,35 @@
To extract the block costs, build Kvazaar as usual, and edit relevant
parameters in the beginning of extract_rdcosts.py and run_filter.py, most
importantly the number of cores and the set of video sequences you want to
encode to extract costs. Run extract_rdcosts.py, it will use Kvazaar to encode
each sequence and extract the costs measured there for the quantized blocks.
The costs are stored compressed and sorted by block QP, in the following
format:
Size (B) | Description
----------+------------
4 | size: Coeff group size, in int16's
4 | ccc: Coeff group's coding cost
size * 2 | coeffs: Coeff group data
To analyze the costs by running a linear regression over them, build the two
tools using:
$ gcc filter_rdcosts.c -O2 -o frcosts_matrix
$ gcc ols_2ndpart.c -O2 -o ols_2ndpart
Then run the regression in parallel by running run_filter.py. The reason to do
it this way is because the data is stored compressed, so there is no way to
mmap it in Matlab/Octave/something; the data sets are absolutely huge (larger
than reasonable amounts of RAM in a decent workstation), but this way we can
store the data compressed and process it in O(1) memory complexity, so it can
be done as widely parallelized as you have CPU cores. The result files each
consist of 4 numbers, which represent an approximate linear solution to the
corresponding set of costs: the price in bits of a coefficient whose absolute
value is a) 0, b) 1, c) 2, d) 3 or higher.
After that, run rdcost_do_avg.py. It will calculate a per-QP average of the
costs over the set of sequences that were run (i.e. for each QP, take the
results for that QP for each sequence, and calculate their average). This data
is what you can use to fill in the default_fast_coeff_cost_wts table in
src/fast_coeff_cost.h.

4
rdcost-weight-tool/build.sh Executable file
View file

@ -0,0 +1,4 @@
#!/bin/sh
gcc -O2 filter_rdcosts.c -o frcosts_matrix
gcc -O2 ols_2ndpart.c -o ols_2ndpart

View file

@ -0,0 +1,166 @@
#!/usr/bin/env python3
import glob
import gzip
import os
import subprocess
import threading
import time
# Where logs and sampled data will wind up, and where the sequences are read.
# Do note that the sequences variable is supposed to be a tuple, because you
# could have multiple sets of sequences.
logdir = "/tmp/rdcost/logs"
ofdir = "/tmp/rdcost/data"
sequences = ("/opt/test_seqs/custom_seqs/*/*.yuv",)
# Note that n_kvazaars * len(dest_qps) has to be less than the max number of
# fd's that a process can have (check it out: ulimit -a, likely 1024)
smt_threads = 8 # Kinda lazy, but just match this to your cpu
n_kvz_threads = 1 # How many threads each kvz instance is running?
# Number of Kvazaar instances run side by side (one worker thread each).
n_kvazaars = smt_threads // n_kvz_threads
# You likely will not need to change anything below this line
# Resolve `path` inside the "src" directory located next to this script's
# own directory (i.e. <parent of script dir>/src/<path>).
kvz_srcdir = lambda path: os.path.join(
os.path.dirname(
os.path.dirname(
os.path.realpath(__file__)
)
), "src", path)
# QPs for which an output pipe/file is created per job (0..50).
dest_qps = tuple(range(51))
# QPs each input sequence is encoded at (12..42).
base_qps = tuple(range(12, 43))
# Base Kvazaar command line; per-job input, QP and output options are
# appended to it for every job.
kvzargs = [kvz_srcdir("kvazaar"), "--threads", str(n_kvz_threads), "--preset=ultrafast", "--fastrd-sampling", "--fast-residual-cost=0"]
# Environment for the Kvazaar process: load the library from src/.libs/.
kvzenv = {"LD_LIBRARY_PATH": kvz_srcdir(".libs/")}
class MultiPipeGZOutManager:
    """Context manager owning one named pipe per destination QP.

    For every QP in ``dest_qps`` a FIFO path ``<odpath>/NN.txt`` and a
    matching gzip output path ``<odpath>/NN.txt.gz`` are computed. Entering
    the context (re)creates the FIFOs, leaving it removes them again.
    """

    pipe_fn_template = "%02i.txt"
    gzout_fn_template = "%02i.txt.gz"

    def __init__(self, odpath, dest_qps):
        self.odpath = odpath
        self.dest_qps = dest_qps

        # Walk dest_qps exactly once so that one-shot iterables work too.
        name_pairs = [
            (os.path.join(self.odpath, self.pipe_fn_template % qp),
             os.path.join(self.odpath, self.gzout_fn_template % qp))
            for qp in dest_qps
        ]
        self.pipe_fns = [pipe for pipe, _ in name_pairs]
        self.gzout_fns = [gzout for _, gzout in name_pairs]

    def __enter__(self):
        os.makedirs(self.odpath, exist_ok=True)
        for fifo_path in self.pipe_fns:
            # Drop a stale pipe from an earlier run before creating a new one.
            try:
                os.unlink(fifo_path)
            except FileNotFoundError:
                pass
            os.mkfifo(fifo_path)
        return self

    def __exit__(self, *_):
        for fifo_path in self.pipe_fns:
            os.unlink(fifo_path)

    def items(self):
        """Yield (pipe path, gzip output path) tuples, one per QP."""
        yield from zip(self.pipe_fns, self.gzout_fns)
class MTSafeIterable:
    """Iterator wrapper that serialises ``next()`` calls with a lock.

    Lets several threads pull items from one shared iterator/generator.
    """

    def __init__(self, iterable):
        self.lock = threading.Lock()
        self.iterable = iterable

    def __iter__(self):
        return self

    def __next__(self):
        self.lock.acquire()
        try:
            return next(self.iterable)
        finally:
            self.lock.release()
def combinations(xi, yi):
    """Lazily yield every (x, y) pair, iterating yi once for each x in xi."""
    for first in xi:
        yield from ((first, second) for second in yi)
def chain(lol):
    """Flatten one nesting level: yield the items of each iterable in lol."""
    for inner in lol:
        yield from inner
# A plain "gzip [fifo]" subprocess is not used here: given a FIFO that has no
# writer yet, gzip treats it as an empty file instead of blocking, so the
# compression is done in-process.
def do_gzip(in_fn, out_fn):
    """Copy in_fn to the gzip file out_fn in 64 KiB blocks until EOF.

    Prints a progress line each time another 64 MiB has been read, and a
    final summary line once the input is exhausted.
    """
    BLOCK_SZ = 65536
    PRINT_MULT = 1024
    report_step = BLOCK_SZ * PRINT_MULT
    bytes_per_mb = 1024 * 1024

    total = 0
    next_report = report_step
    with open(in_fn, "rb") as src, gzip.open(out_fn, "wb") as dst:
        while True:
            chunk = src.read(BLOCK_SZ)
            total += len(chunk)
            if total >= next_report:
                print(" read %8i MB from %s" % (total / bytes_per_mb, in_fn))
                next_report += report_step
            if not chunk:
                break
            dst.write(chunk)
        print(" finished %8i MB from %s" % (total / bytes_per_mb, in_fn))
def run_job(job):
    """Encode one (sequence path, QP) job and gzip the sampled cost data.

    A FIFO is created for every destination QP and a reader thread compresses
    whatever Kvazaar writes into it. Kvazaar's stderr goes to a per-job log
    file under ``logdir``; its bitstream goes to /tmp.

    Unlike a fire-and-forget launch, this waits for all reader threads before
    the FIFOs are removed, and reports a non-zero encoder exit status.
    """
    ifpath, qp = job
    jobname = "%s-qp%i" % (os.path.basename(ifpath), qp)

    hevcpath = os.path.join("/tmp", "%s.hevc" % jobname)
    logpath = os.path.join(logdir, "%s.log" % jobname)
    odpath = os.path.join(ofdir, jobname)

    my_kvzargs = kvzargs + ["-i", ifpath,
                            "--qp", str(qp),
                            "-o", hevcpath,
                            "--fastrd-outdir", odpath]

    with open(logpath, "w") as lf:
        with MultiPipeGZOutManager(odpath, dest_qps) as pipes_and_outputs:
            gzip_threads = []
            for pipe_fn, out_fn in pipes_and_outputs.items():
                gzip_thread = threading.Thread(target=do_gzip, args=(pipe_fn, out_fn))
                gzip_thread.start()
                gzip_threads.append(gzip_thread)

            kvz = subprocess.Popen(my_kvzargs, env=kvzenv, stderr=lf)
            kvz.wait()
            if kvz.returncode != 0:
                print("kvazaar exited with status %i for job %s, see %s"
                      % (kvz.returncode, jobname, logpath))

            # If the encoder died before opening some FIFO, its reader is
            # still blocked in open(). Opening the write side without
            # blocking and closing it again hands that reader an EOF.
            # NOTE(review): relies on Linux FIFO semantics (a reader blocked
            # in open() counts as the read end being open) - confirm on other
            # platforms. The open fails when the reader has already finished.
            for pipe_fn, _ in pipes_and_outputs.items():
                try:
                    wake_fd = os.open(pipe_fn, os.O_WRONLY | os.O_NONBLOCK)
                except OSError:
                    continue
                os.close(wake_fd)

            # Make sure every .gz file is complete before the pipes go away.
            for gzip_thread in gzip_threads:
                gzip_thread.join()
def threadfunc(joblist):
    """Worker loop: pull jobs from the shared iterator until it runs dry."""
    pending = iter(joblist)
    while True:
        try:
            job = next(pending)
        except StopIteration:
            return
        run_job(job)
def main():
    """Build the (sequence, base QP) job list and run it on n_kvazaars threads."""
    assert(isinstance(sequences, tuple))

    os.makedirs(logdir, exist_ok=True)
    os.makedirs(ofdir, exist_ok=True)

    # Every file matched by any glob in `sequences`, paired with every base QP.
    input_files = chain(map(glob.glob, sequences))
    joblist = MTSafeIterable(combinations(input_files, base_qps))

    workers = []
    for _ in range(n_kvazaars):
        workers.append(threading.Thread(target=threadfunc, args=(joblist,)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if (__name__ == "__main__"):
    main()

View file

@ -0,0 +1,134 @@
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#define BUFSZ (64 * 64 * sizeof(uint16_t))
#define NUM_COEFF_BUCKETS (4)
#define NUM_OTHER_BUCKETS (0)
#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
#define MAX_COEFF_BUCKET ((NUM_COEFF_BUCKETS) - 1)
#define clz(x) __builtin_clz(x)
#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
/**
 * Debug helper: dump one coefficient group to stdout.
 *
 * Prints the group size and its coding cost on the first line, followed by
 * all `size` coefficients separated by spaces on the second line.
 */
void print_coeffs(const int16_t *buf, uint32_t size, uint32_t ccc)
{
  printf("Buf size %u, ccc %u\n", size, ccc);

  uint32_t remaining = size;
  while (remaining > 0) {
    printf("%i ", *buf);
    buf++;
    remaining--;
  }
  putchar('\n');
}
/**
 * Build a histogram of coefficient magnitudes for one coefficient group.
 *
 * \param buf        coefficients of the group
 * \param size       number of coefficients in buf
 * \param buckets    histogram to increment; magnitudes above
 *                   MAX_COEFF_BUCKET are counted in the last bucket
 * \param num_signs  incremented once for every negative coefficient
 * \param excess     reset to 0, then accumulates (modulo 2^16) how far each
 *                   clamped magnitude exceeded MAX_COEFF_BUCKET
 */
void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs, uint16_t *excess)
{
  *excess = 0;

  for (uint32_t idx = 0; idx < size; idx++) {
    const int value = buf[idx];
    const int negative = value < 0;

    *num_signs += (uint64_t)negative;

    // Widened to int first, so -32768 maps to 32768 without overflow.
    uint16_t magnitude = (uint16_t)(negative ? -value : value);
    if (magnitude > MAX_COEFF_BUCKET) {
      *excess = (uint16_t)(*excess + (magnitude - MAX_COEFF_BUCKET));
      magnitude = MAX_COEFF_BUCKET;
    }
    buckets[magnitude]++;
  }
}
/**
 * Debug helper: print the coefficient histogram and the sign count to stdout.
 *
 * The 64-bit counters are cast to unsigned long long and printed with %llu;
 * passing a uint64_t to %lu is a format/argument mismatch on platforms where
 * long is 32 bits wide.
 */
void print_buckets(const uint64_t *buckets, uint64_t num_signs)
{
  for (unsigned i = 0; i < NUM_COEFF_BUCKETS; i++) {
    printf("%3u: %llu\n", i, (unsigned long long)buckets[i]);
  }
  printf("Signs: %llu\n", (unsigned long long)num_signs);
}
/**
 * Add the outer product of the bucket vector with itself to `mat`.
 *
 * `mat` is a row-major NUM_TOTAL_BUCKETS x NUM_TOTAL_BUCKETS matrix; the
 * element at (row, col) grows by buckets[col] * buckets[row].
 */
void update_matrix(const uint64_t *buckets, uint64_t *mat)
{
  uint64_t *cell = mat;

  for (int row = 0; row < NUM_TOTAL_BUCKETS; row++) {
    for (int col = 0; col < NUM_TOTAL_BUCKETS; col++) {
      *cell += buckets[col] * buckets[row];
      cell++;
    }
  }
}
/**
 * Return non-zero iff `u` is a power of two.
 *
 * Zero is rejected explicitly: (0 & (0 - 1)) == 0 would otherwise classify
 * it as a power of two, and a zero block size is not a valid input for the
 * count-leading-zeros based ilog2() macro.
 */
static inline int is_power_of_two(uint32_t u)
{
  return u != 0 && (u & (u - 1)) == 0;
}
/**
 * Read coefficient-group records from `in` and accumulate the sum of the
 * outer products of their bucket histograms; print the resulting
 * NUM_TOTAL_BUCKETS x NUM_TOTAL_BUCKETS matrix to `out`.
 *
 * Record layout: two uint32 values (coefficient count, coding cost) followed
 * by that many int16 coefficients. The coding cost is not needed here.
 *
 * \return 0 on success, 1 on allocation failure or malformed/truncated
 *         input (nothing is printed in that case).
 */
int process_rdcosts(FILE *in, FILE *out)
{
  // Largest coefficient group that fits into the BUFSZ-byte read buffer.
  const uint32_t max_coeffs = (uint32_t)(BUFSZ / sizeof(int16_t));

  uint64_t mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0};
  int rv = 0;

  int16_t *coeffs = malloc(BUFSZ);
  if (coeffs == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes for the coefficient buffer\n", (size_t)BUFSZ);
    return 1;
  }

  for (;;) {
    uint32_t header[2];
    uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
    uint64_t cg_num_signs = 0;
    uint16_t excess = 0;

    size_t n_read = fread(header, sizeof(header[0]), 2, in);

    // Can't rely on feof() alone when reading from a pipe that might only get
    // closed long after the last data has been poured in
    if (n_read == 0) {
      break;
    }
    if (n_read < 2) {
      fprintf(stderr, "Unexpected EOF when reading header, managed still to read %zu u32's\n", n_read);
      rv = 1;
      goto out;
    }

    // Only inspect the header once it is known to have been read completely.
    const uint32_t size = header[0];

    // A size of zero or one exceeding the buffer would either be meaningless
    // or overflow `coeffs` in the fread() below.
    if (size == 0 || size > max_coeffs || !is_power_of_two(size)) {
      fprintf(stderr, "Errorneous block size %u\n", size);
      rv = 1;
      goto out;
    }

    n_read = fread(coeffs, sizeof(coeffs[0]), size, in);
    if (n_read != size) {
      fprintf(stderr, "Unexpected EOF when reading block, managed still to read %zu i16's\n", n_read);
      rv = 1;
      goto out;
    }

    count_coeffs(coeffs, size, cg_buckets, &cg_num_signs, &excess);
    update_matrix(cg_buckets, mat);
  }

  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
    for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
      const int curr_pos = y * NUM_TOTAL_BUCKETS + x;
      fprintf(out, "%llu ", (unsigned long long)mat[curr_pos]);
    }
    fprintf(out, "\n");
  }
  fflush(out);

out:
  free(coeffs);
  return rv;
}
/* Entry point: filter stdin to stdout; command line arguments are ignored. */
int main(int ar, char **av)
{
  (void)ar;
  (void)av;

  const int status = process_rdcosts(stdin, stdout);
  return status;
}

View file

@ -0,0 +1,3 @@
% Read a matrix from standard input, invert it and write the inverse to
% standard output with space-separated columns.
input_matrix = dlmread("/dev/stdin");
inverse_matrix = inv(input_matrix);
dlmwrite("/dev/stdout", inverse_matrix, " ");

View file

@ -0,0 +1,132 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#define BUFSZ (64 * 64 * sizeof(uint16_t))
#define NUM_COEFF_BUCKETS (4)
#define NUM_OTHER_BUCKETS (0)
#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
#ifdef ERR_SQUARED
#define STEPSIZE (0.00000001f * 0.000001f)
#else
#define STEPSIZE (0.00000001f)
#endif
#define clz(x) __builtin_clz(x)
#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
#define coord(x,y,w) ((x)+((y)*(w)))
/**
 * Accumulate one coefficient group into the result vector.
 *
 * For every row of the row-major matrix `mat`, the dot product of that row
 * with the bucket histogram is scaled by the group's coding cost `ccc` and
 * added to the corresponding element of `res`.
 */
void update_result(const uint64_t *buckets, uint64_t ccc, const double *mat, double *res)
{
  const double cost = (double)ccc;

  for (int row = 0; row < NUM_TOTAL_BUCKETS; row++) {
    double dot = 0.0;
    int col = 0;
    while (col < NUM_TOTAL_BUCKETS) {
      dot += mat[coord(col, row, NUM_TOTAL_BUCKETS)] * (double)buckets[col];
      col++;
    }
    dot *= cost;
    res[row] += dot;
  }
}
/**
 * Load a NUM_TOTAL_BUCKETS x NUM_TOTAL_BUCKETS matrix of whitespace
 * separated numbers from the text file `fn` into `mat` (row-major).
 *
 * Values are parsed as float and then widened to double. The function
 * cannot report errors through its signature, so a file that cannot be
 * opened or that holds too few numbers terminates the program with
 * EXIT_FAILURE rather than continuing with a NULL stream or uninitialised
 * matrix entries.
 */
void read_matrix(const char *fn, double *mat)
{
  FILE *f = fopen(fn, "r");
  if (f == NULL) {
    perror(fn);
    exit(EXIT_FAILURE);
  }

  for (int i = 0; i < NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS; i++) {
    float curr;
    if (fscanf(f, "%f", &curr) != 1) {
      fprintf(stderr, "Failed to read matrix element %d from %s\n", i, fn);
      fclose(f);
      exit(EXIT_FAILURE);
    }
    mat[i] = curr;
  }

  fclose(f);
}
/**
 * Build a histogram of coefficient magnitudes for one coefficient group.
 *
 * \param buf        coefficients of the group
 * \param size       number of coefficients in buf
 * \param buckets    histogram to increment; magnitudes of
 *                   NUM_COEFF_BUCKETS - 1 or more share the last bucket
 * \param num_signs  incremented once for every negative coefficient
 */
void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs)
{
  for (uint32_t idx = 0; idx < size; idx++) {
    const int value = buf[idx];
    const int negative = value < 0;

    *num_signs += (uint64_t)negative;

    // Widened to int first, so -32768 maps to 32768 without overflow.
    uint16_t magnitude = (uint16_t)(negative ? -value : value);
    if (magnitude >= NUM_COEFF_BUCKETS) {
      magnitude = NUM_COEFF_BUCKETS - 1;
    }
    buckets[magnitude]++;
  }
}
/**
 * Return non-zero iff `u` is a power of two.
 *
 * Zero is rejected explicitly: (0 & (0 - 1)) == 0 would otherwise classify
 * it as a power of two, and a zero block size is not a valid input for the
 * count-leading-zeros based ilog2() macro.
 */
static inline int is_power_of_two(uint32_t u)
{
  return u != 0 && (u & (u - 1)) == 0;
}
/**
 * Read coefficient-group records from `in`, accumulate each group's
 * cost-weighted contribution through `mat`, and print the
 * NUM_TOTAL_BUCKETS resulting values to `out`, one per line.
 *
 * Record layout: two uint32 values (coefficient count, coding cost) followed
 * by that many int16 coefficients.
 *
 * \return 0 on success, 1 on allocation failure or malformed/truncated
 *         input (nothing is printed in that case).
 */
int process_rdcosts(FILE *in, FILE *out, const double *mat)
{
  // Largest coefficient group that fits into the BUFSZ-byte read buffer.
  const uint32_t max_coeffs = (uint32_t)(BUFSZ / sizeof(int16_t));

  double res[NUM_TOTAL_BUCKETS] = {0.0};
  int rv = 0;

  int16_t *coeffs = malloc(BUFSZ);
  if (coeffs == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes for the coefficient buffer\n", (size_t)BUFSZ);
    return 1;
  }

  for (;;) {
    uint32_t header[2];
    uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
    uint64_t cg_num_signs = 0;

    size_t n_read = fread(header, sizeof(header[0]), 2, in);

    // Can't rely on feof() alone when reading from a pipe that might only get
    // closed long after the last data has been poured in
    if (n_read == 0) {
      break;
    }
    if (n_read < 2) {
      fprintf(stderr, "Unexpected EOF when reading header, managed still to read %zu u32's\n", n_read);
      rv = 1;
      goto out;
    }

    // Only inspect the header once it is known to have been read completely.
    const uint32_t size = header[0];
    const uint32_t ccc = header[1];

    // A size of zero or one exceeding the buffer would either be meaningless
    // or overflow `coeffs` in the fread() below.
    if (size == 0 || size > max_coeffs || !is_power_of_two(size)) {
      fprintf(stderr, "Errorneous block size %u\n", size);
      rv = 1;
      goto out;
    }

    n_read = fread(coeffs, sizeof(coeffs[0]), size, in);
    if (n_read != size) {
      fprintf(stderr, "Unexpected EOF when reading block, managed still to read %zu i16's\n", n_read);
      rv = 1;
      goto out;
    }

    count_coeffs(coeffs, size, cg_buckets, &cg_num_signs);
    update_result(cg_buckets, ccc, mat, res);
  }

  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
    fprintf(out, "%g\n", (float)(res[y]));
  }

out:
  free(coeffs);
  return rv;
}
/*
 * Entry point. Expects exactly one argument: the path of the matrix file.
 * Cost records are read from stdin and the result is written to stdout.
 */
int main(int ar, char **av)
{
  if (ar == 2) {
    double mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0.0};

    read_matrix(av[1], mat);
    return process_rdcosts(stdin, stdout, mat);
  }

  fprintf(stderr, "gib matrix plz\n");
  return 1;
}

View file

@ -0,0 +1,38 @@
#!/usr/bin/env python3
import glob
import sys
result_path_template = "/tmp/rdcost/coeff_buckets/*-qp%02i.result"
def main():
    """Print, for each QP 0..50, the average of the four weights over all
    result files matching result_path_template for that QP.

    Files that do not hold exactly four lines, or whose four numbers are all
    zero, are reported and skipped. QPs without usable files yield zeros.
    """
    per_qp_averages = []
    for qp in range(51):
        totals = [0.0] * 4
        n_used = 0

        for fn in glob.glob(result_path_template % qp):
            with open(fn) as f:
                lines = f.readlines()
            if (len(lines) != 4):
                print("Faulty file contents at %s, skipping" % fn, file=sys.stderr)
                continue

            values = tuple(float(line) for line in lines)
            if (not any(v != 0.0 for v in values)):
                print("All-zero file %s, skipping" % fn)
                continue

            n_used += 1
            totals = [total + value for total, value in zip(totals, values)]

        if (n_used > 0):
            per_qp_averages.append(tuple(total / n_used for total in totals))
        else:
            per_qp_averages.append((0, 0, 0, 0))

    value_fmt = ", ".join("%.6f" for _ in range(4))
    out_lines = []
    for qp, averages in enumerate(per_qp_averages):
        out_lines.append("QP %2i: " % qp + value_fmt % averages)
    print("\n".join(out_lines))

154
rdcost-weight-tool/run_filter.py Executable file
View file

@ -0,0 +1,154 @@
#!/usr/bin/env python3
import glob
import gzip
import os
import re
import subprocess
import sys
import tempfile
import threading
import time
# You should change these to your liking
# Number of worker threads started by main().
n_threads = 8
# Directory scanned for "<sequence>.yuv-qp<N>" sample directories.
datadirs = "/tmp/rdcost/data/"
# Directory receiving one "<job name>.result" file per job.
resultdir = "/tmp/rdcost/coeff_buckets"
# Command lines of the external tools. gzargs is not referenced by the code
# in this script; decompression is done in-process with the gzip module.
gzargs = ["gzip", "-d"]
filtargs = ["./frcosts_matrix"]
octargs = ["octave-cli", "invert_matrix.m"]
filt2args = ["./ols_2ndpart"]
class MultiPipeManager:
    """Context manager owning one named pipe ``<odpath>/NN.txt`` per QP.

    Entering the context (re)creates the FIFOs, leaving it removes them.
    """

    pipe_fn_template = "%02i.txt"

    def __init__(self, odpath, dest_qps):
        self.odpath = odpath
        self.dest_qps = dest_qps
        self.pipe_fns = [
            os.path.join(self.odpath, self.pipe_fn_template % qp)
            for qp in dest_qps
        ]

    def __enter__(self):
        os.makedirs(self.odpath, exist_ok=True)
        for fifo_path in self.pipe_fns:
            # Drop a stale pipe from an earlier run before creating a new one.
            try:
                os.unlink(fifo_path)
            except FileNotFoundError:
                pass
            os.mkfifo(fifo_path)
        return self

    def __exit__(self, *_):
        for fifo_path in self.pipe_fns:
            os.unlink(fifo_path)

    def items(self):
        """Yield the pipe paths in QP order."""
        yield from self.pipe_fns
class MTSafeIterable:
    """Iterator wrapper that serialises ``next()`` calls with a lock.

    Lets several threads pull items from one shared iterator/generator.
    """

    def __init__(self, iterable):
        self.lock = threading.Lock()
        self.iterable = iterable

    def __iter__(self):
        return self

    def __next__(self):
        self.lock.acquire()
        try:
            return next(self.iterable)
        finally:
            self.lock.release()
def read_in_blocks(f):
    """Yield successive chunks of up to 64 KiB from f until a read is empty."""
    BLOCK_SZ = 65536
    chunk = f.read(BLOCK_SZ)
    while chunk:
        yield chunk
        chunk = f.read(BLOCK_SZ)
def exhaust_gzs(sink_f, gzs):
    """Decompress every gzip file in gzs, in order, into sink_f.

    sink_f is flushed after each input file. A leftover debugging print that
    fired for one hard-coded sample path has been removed.
    """
    for gz in gzs:
        with gzip.open(gz, "rb") as f:
            print(" Doing %s ..." % gz)
            for block in read_in_blocks(f):
                sink_f.write(block)
            sink_f.flush()
def run_job(jobname, input_gzs):
    """Run the two-stage regression for one job.

    Stage 1 pipes the decompressed samples through the matrix filter into
    Octave, which writes the inverted matrix to a temporary file. Stage 2
    feeds the same samples to the second filter together with that matrix
    and stores its output in ``<resultdir>/<jobname>.result``.

    Raises RuntimeError when any of the external tools exits with a non-zero
    status, and re-raises OSError from feeding stage 1.
    """
    resultpath = os.path.join(resultdir, "%s.result" % jobname)
    print("Running job %s" % jobname)

    with tempfile.NamedTemporaryFile() as tf:
        filt = subprocess.Popen(filtargs, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        octa = subprocess.Popen(octargs, stdin=filt.stdout, stdout=tf)
        # Octave now owns the read end; dropping our copy lets the filter see
        # a broken pipe if Octave exits early instead of blocking forever.
        filt.stdout.close()

        try:
            exhaust_gzs(filt.stdin, input_gzs)
        except OSError as e:
            print("OSError %s" % e, file=sys.stderr)
            raise

        filt.stdin.close()
        filt.wait()
        octa.wait()

        if (filt.returncode != 0 or octa.returncode != 0):
            print("First stage failed: %s" % jobname, file=sys.stderr)
            raise RuntimeError("First stage failed: %s" % jobname)

        with open(resultpath, "w") as rf:
            f2a = filt2args + [tf.name]
            f2 = subprocess.Popen(f2a, stdin=subprocess.PIPE, stdout=rf)
            exhaust_gzs(f2.stdin, input_gzs)
            # communicate() closes stdin and waits for the process to exit.
            f2.communicate()

            # Check the second-stage process itself, not the first-stage filter.
            if (f2.returncode != 0):
                print("Second stage failed: %s" % jobname, file=sys.stderr)
                raise RuntimeError("Second stage failed: %s" % jobname)

    print("Job %s done" % jobname)
def threadfunc(joblist):
    """Worker loop: pull (job name, input files) pairs until none are left."""
    pending = iter(joblist)
    while True:
        try:
            jobname, job = next(pending)
        except StopIteration:
            return
        run_job(jobname, job)
def scan_datadirs(path):
    """Yield (job name, list of .gz sample files) for every sequence and QP.

    Sequence names are taken from the sub-directories of ``path`` named
    ``<name>.yuv-qp<N>``. For each sequence and each QP 0..50 the job collects
    the file ``NN.txt.gz`` from all of that sequence's directories; the list
    is empty when no directory holds that file.
    """
    seq_names = set()
    for dirent in os.scandir(path):
        if (not dirent.is_dir()):
            continue
        # Raw string: "\." in a normal literal is an invalid escape sequence.
        match = re.search(r"^([A-Za-z0-9_]+\.yuv)-qp[0-9]{1,2}$", dirent.name)
        if (match is not None):
            seq_names.add(match.group(1))

    # Sorted so that the job order does not depend on set iteration order.
    for seq_name in sorted(seq_names):
        seq_glob = os.path.join(path, seq_name + "-qp*/")
        for qp in range(51):
            job_name = seq_name + "-qp%02i" % qp
            qp_fn = "%02i.txt.gz" % qp
            yield job_name, glob.glob(os.path.join(seq_glob, qp_fn))
def main():
    """Scan the data directory for jobs and run them on n_threads workers."""
    os.makedirs(datadirs, exist_ok=True)
    os.makedirs(resultdir, exist_ok=True)

    joblist = MTSafeIterable(iter(scan_datadirs(datadirs)))

    workers = []
    for _ in range(n_threads):
        workers.append(threading.Thread(target=threadfunc, args=(joblist,)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if (__name__ == "__main__"):
    main()

View file

@ -73,6 +73,8 @@ libkvazaar_la_SOURCES = \
encoder_state-geometry.h \
encode_coding_tree.c \
encode_coding_tree.h \
fast_coeff_cost.c \
fast_coeff_cost.h \
filter.c \
filter.h \
global.h \

View file

@ -81,6 +81,7 @@ int kvz_config_init(kvz_config *cfg)
cfg->vui.chroma_loc = 0; /* left center */
cfg->aud_enable = 0;
cfg->cqmfile = NULL;
cfg->fast_coeff_table_fn = NULL;
cfg->ref_frames = 1;
cfg->gop_len = 4;
cfg->gop_lowdelay = true;
@ -176,6 +177,10 @@ int kvz_config_init(kvz_config *cfg)
cfg->stats_file_prefix = NULL;
cfg->fastrd_sampling_on = 0;
cfg->fastrd_accuracy_check_on = 0;
cfg->fastrd_learning_outdir_fn = NULL;
int8_t in[] = { 17, 27, 32, 44 };
int8_t out[] = { 17, 29, 34, 41 };
@ -196,11 +201,13 @@ int kvz_config_destroy(kvz_config *cfg)
{
if (cfg) {
FREE_POINTER(cfg->cqmfile);
FREE_POINTER(cfg->fast_coeff_table_fn);
FREE_POINTER(cfg->tiles_width_split);
FREE_POINTER(cfg->tiles_height_split);
FREE_POINTER(cfg->slice_addresses_in_ts);
FREE_POINTER(cfg->roi.dqps);
FREE_POINTER(cfg->optional_key);
FREE_POINTER(cfg->fastrd_learning_outdir_fn);
if (cfg->param_set_map)
{
FREE_POINTER(cfg->param_set_map);
@ -904,6 +911,30 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
cfg->cqmfile = cqmfile;
cfg->scaling_list = KVZ_SCALING_LIST_CUSTOM;
}
else if OPT("fast-coeff-table") {
char* fast_coeff_table_fn = strdup(value);
if (!fast_coeff_table_fn) {
fprintf(stderr, "Failed to allocate memory for fast coeff table file name.\n");
return 0;
}
FREE_POINTER(cfg->fast_coeff_table_fn);
cfg->fast_coeff_table_fn = fast_coeff_table_fn;
}
else if OPT("fastrd-sampling") {
cfg->fastrd_sampling_on = 1;
}
else if OPT("fastrd-accuracy-check") {
cfg->fastrd_accuracy_check_on = 1;
}
else if OPT("fastrd-outdir") {
char *fastrd_learning_outdir_fn = strdup(value);
if (!fastrd_learning_outdir_fn) {
fprintf(stderr, "Failed to allocate memory for fast RD learning outfile name.\n");
return 0;
}
FREE_POINTER(cfg->fastrd_learning_outdir_fn);
cfg->fastrd_learning_outdir_fn = fastrd_learning_outdir_fn;
}
else if OPT("scaling-list") {
int8_t scaling_list = KVZ_SCALING_LIST_OFF;
int result = parse_enum(value, scaling_list_names, &scaling_list);

View file

@ -155,6 +155,10 @@ static const struct option long_options[] = {
{ "no-clip-neighbour", no_argument, NULL, 0 },
{ "input-file-format", required_argument, NULL, 0 },
{ "stats-file-prefix", required_argument, NULL, 0 },
{ "fast-coeff-table", required_argument, NULL, 0 },
{ "fastrd-sampling", no_argument, NULL, 0 },
{ "fastrd-accuracy-check", no_argument, NULL, 0 },
{ "fastrd-outdir", required_argument, NULL, 0 },
{0, 0, 0, 0}
};
@ -577,6 +581,16 @@ void print_help(void)
" - sensitive: Terminate even earlier.\n"
" --fast-residual-cost <int> : Skip CABAC cost for residual coefficients\n"
" when QP is below the limit. [0]\n"
" --fast-coeff-table <string> : Read custom weights for residual\n"
" coefficients from a file instead of using\n"
" defaults [default]\n"
" --fast-rd-sampling : Enable learning data sampling for fast coefficient\n"
" table generation\n"
" --fastrd-accuracy-check : Evaluate the accuracy of fast coefficient\n"
" prediction\n"
" --fastrd-outdir : Directory to which to output sampled data or accuracy\n"
" data, into <fastrd-outdir>/0.txt to 50.txt, one file\n"
" for each QP that blocks were estimated on\n"
" --(no-)intra-rdo-et : Check intra modes in rdo stage only until\n"
" a zero coefficient CU is found. [disabled]\n"
" --(no-)early-skip : Try to find skip cu from merge candidates.\n"

View file

@ -279,7 +279,11 @@ done:
// Do some cleaning up.
args->api->picture_free(frame_in);
// This thread exit call causes problems with media auto-build suite
// The environment compiles with MINGW using a different pthreads lib
#if !defined(__MINGW32__) && !defined(__MINGW64__)
pthread_exit(NULL);
#endif
return NULL;
}

View file

@ -25,6 +25,7 @@
#include "cu.h"
#include "encoder.h"
#include "extras/crypto.h"
#include "global.h"
#include "imagelist.h"
#include "inter.h"
#include "intra.h"
@ -351,8 +352,9 @@ static void encode_transform_coeff(encoder_state_t * const state,
if (state->must_code_qp_delta) {
const int qp_pred = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp);
const int qp_delta = cur_cu->qp - qp_pred;
assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding.");
assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25]."); // This range applies only to 8-bit encoding
// Possible deltaQP range depends on bit depth as stated in HEVC specification.
assert(qp_delta >= KVZ_QP_DELTA_MIN && qp_delta <= KVZ_QP_DELTA_MAX && "QP delta not in valid range.");
const int qp_delta_abs = ABS(qp_delta);
cabac_data_t* cabac = &state->cabac;

View file

@ -28,9 +28,10 @@
#include "cfg.h"
#include "gop.h"
#include "rdo.h"
#include "strategyselector.h"
#include "kvz_math.h"
#include "fast_coeff_cost.h"
/**
* \brief Strength of QP adjustments when using adaptive QP for 360 video.
@ -275,6 +276,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
encoder->cfg.tiles_width_split = NULL;
encoder->cfg.tiles_height_split = NULL;
encoder->cfg.slice_addresses_in_ts = NULL;
encoder->cfg.fast_coeff_table_fn = NULL;
if (encoder->cfg.gop_len > 0) {
if (encoder->cfg.gop_lowdelay) {
@ -287,7 +289,8 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
}
if( encoder->cfg.intra_qp_offset_auto ) {
encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? -kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1 : 0;
// Limit offset to -3 since HM/VTM seems to use it even for 32 frame gop
encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? MAX(-(int8_t)kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1, -3) : 0;
}
// Disable GOP and QP offset for all-intra coding
@ -381,6 +384,31 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
encoder->scaling_list.use_default_list = 1;
}
if (cfg->fast_coeff_table_fn) {
FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
if (fast_coeff_table_f == NULL) {
fprintf(stderr, "Could not open fast coeff table file.\n");
goto init_failed;
}
if (kvz_fast_coeff_table_parse(&encoder->fast_coeff_table, fast_coeff_table_f) != 0) {
fprintf(stderr, "Failed to parse fast coeff table, using default\n");
kvz_fast_coeff_use_default_table(&encoder->fast_coeff_table);
}
fclose(fast_coeff_table_f);
} else {
kvz_fast_coeff_use_default_table(&encoder->fast_coeff_table);
}
if (cfg->fastrd_sampling_on || cfg->fastrd_accuracy_check_on) {
if (cfg->fastrd_learning_outdir_fn == NULL) {
fprintf(stderr, "No output file defined for Fast RD sampling or accuracy check.\n");
goto init_failed;
}
if (kvz_init_rdcost_outfiles(cfg->fastrd_learning_outdir_fn) != 0) {
goto init_failed;
}
}
kvz_scalinglist_process(&encoder->scaling_list, encoder->bitdepth);
kvz_encoder_control_input_init(encoder, encoder->cfg.width, encoder->cfg.height);
@ -742,6 +770,8 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)
if (encoder->qp_map[i]) FREE_POINTER(encoder->qp_map[i]);
}
kvz_close_rdcost_outfiles();
free(encoder);
}

View file

@ -30,7 +30,7 @@
#include "kvazaar.h"
#include "scalinglist.h"
#include "threadqueue.h"
#include "fast_coeff_cost.h"
/* Encoder control options, the main struct */
typedef struct encoder_control_t
@ -135,6 +135,8 @@ typedef struct encoder_control_t
int32_t poc_lsb_bits;
fast_coeff_table_t fast_coeff_table;
int8_t* qp_map[3];
} encoder_control_t;

5
src/estimate.m Normal file
View file

@ -0,0 +1,5 @@
% Fit an ordinary least squares model to the samples read from standard input.
% Each input row is space separated: columns 1-5 are the regressors and
% column 6 is the observed cost. The fitted coefficients are printed.
samples = dlmread("/dev/stdin", " ");
regressors = samples(:, 1:5);
observed = samples(:, 6);
[beta, sigma, r] = ols(observed, regressors);
disp(beta)

56
src/fast_coeff_cost.c Normal file
View file

@ -0,0 +1,56 @@
#include "fast_coeff_cost.h"
#include "kvazaar.h"
#include "encoderstate.h"
/**
 * \brief Convert a weight to unsigned Q8.8 fixed point, rounding to nearest.
 *
 * Costs are expected to be non-negative. Converting a float whose truncated
 * value does not fit the destination integer type is undefined behavior, so
 * out-of-range inputs are saturated instead of cast: negative values and NaN
 * yield 0, values too large for 16 bits yield UINT16_MAX.
 *
 * \param f  Weight to convert.
 * \return   The weight scaled by 256, rounded, as a 16-bit unsigned integer.
 */
static uint16_t to_q88(float f)
{
  const float scaled = f * 256.0f + 0.5f;

  // Negated comparison on purpose: NaN fails every comparison and lands here.
  if (!(scaled >= 0.0f)) {
    return 0;
  }
  if (scaled >= 65536.0f) {
    return UINT16_MAX;
  }
  return (uint16_t)scaled;
}
/**
 * \brief Pack four weights into one 64-bit word as Q8.8 values.
 *
 * f[0] occupies the lowest 16 bits and f[3] the highest 16 bits.
 */
static uint64_t to_4xq88(const float f[4])
{
  uint64_t packed = 0;
  int lane;

  for (lane = 0; lane < 4; lane++) {
    packed |= (uint64_t)to_q88(f[lane]) << (16 * lane);
  }
  return packed;
}
/**
 * \brief Fill a fast coefficient cost table from a text stream.
 *
 * Reads MAX_FAST_COEFF_COST_QP rows of four floats each and stores every row
 * packed into one 64-bit entry.
 *
 * \param fast_coeff_table    Table to fill.
 * \param fast_coeff_table_f  Open stream to read the rows from.
 * \return 0 when all rows were read, 1 as soon as a row does not yield four
 *         values. Entries stored before the failing row remain written.
 */
int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_coeff_table_f)
{
  int qp = 0;

  while (qp < MAX_FAST_COEFF_COST_QP) {
    float row[4];
    const int fields = fscanf(fast_coeff_table_f, "%f %f %f %f\n",
                              &row[0], &row[1], &row[2], &row[3]);
    if (fields != 4) {
      return 1;
    }
    fast_coeff_table->wts_by_qp[qp] = to_4xq88(row);
    qp++;
  }

  return 0;
}
/**
 * \brief Fill a fast coefficient cost table from the built-in defaults.
 *
 * Packs the first MAX_FAST_COEFF_COST_QP rows of default_fast_coeff_cost_wts.
 */
void kvz_fast_coeff_use_default_table(fast_coeff_table_t *fast_coeff_table)
{
  const float (*src_row)[4] = default_fast_coeff_cost_wts;
  uint64_t *dst = fast_coeff_table->wts_by_qp;
  uint64_t *const dst_end = dst + MAX_FAST_COEFF_COST_QP;

  while (dst != dst_end) {
    *dst = to_4xq88(*src_row);
    dst++;
    src_row++;
  }
}
/**
 * \brief Look up the packed weights for the QP of the given encoder state.
 *
 * The table is indexed directly with state->qp; no range check is done here.
 */
uint64_t kvz_fast_coeff_get_weights(const encoder_state_t *state)
{
  return state->encoder_control->fast_coeff_table.wts_by_qp[state->qp];
}

78
src/fast_coeff_cost.h Normal file
View file

@ -0,0 +1,78 @@
#ifndef FAST_COEFF_COST_H_
#define FAST_COEFF_COST_H_
#include <stdio.h>
#include "kvazaar.h"
// #include "encoderstate.h"
#define MAX_FAST_COEFF_COST_QP 50
// Fast coefficient cost weights, one 64-bit entry per QP
// (MAX_FAST_COEFF_COST_QP entries in total).
typedef struct {
uint64_t wts_by_qp[MAX_FAST_COEFF_COST_QP];
} fast_coeff_table_t;
// Default weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), one
// row per QP starting from QP 0. The table holds 51 rows (QP 0..50); code that
// iterates up to MAX_FAST_COEFF_COST_QP reads the first 50 of them.
static const float default_fast_coeff_cost_wts[][4] = {
// Just extend it by stretching the first actual values..
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
{0.164240, 4.161530, 3.509033, 6.928047},
// up to here
{0.164240, 4.161530, 3.509033, 6.928047},
{0.162844, 4.055940, 3.564467, 6.861493},
{0.128729, 4.311973, 3.942837, 6.935403},
{0.110956, 4.433190, 3.945753, 6.877697},
{0.095026, 4.483547, 4.194173, 6.781540},
{0.075046, 4.633703, 4.084193, 6.698600},
{0.052426, 4.967223, 4.027210, 6.549197},
{0.040219, 5.141820, 3.982650, 6.461557},
{0.035090, 5.192493, 3.830950, 6.418477},
{0.029845, 5.211647, 3.815457, 6.345440},
{0.023522, 5.322213, 3.816537, 6.360677},
{0.021305, 5.225923, 3.842700, 6.325787},
{0.015878, 5.183090, 3.956003, 6.329680},
{0.010430, 5.099230, 4.176803, 6.305400},
{0.008433, 5.030257, 4.237587, 6.270133},
{0.006500, 4.969247, 4.339397, 6.217827},
{0.004929, 4.923500, 4.442413, 6.183523},
{0.003715, 4.915583, 4.429090, 6.125320},
{0.003089, 4.883907, 4.562790, 6.156447},
{0.002466, 4.881063, 4.629883, 6.142643},
{0.002169, 4.882493, 4.646313, 6.127663},
{0.002546, 4.793337, 4.837413, 6.199270},
{0.001314, 4.808853, 4.828337, 6.243437},
{0.001154, 4.862603, 4.846883, 6.205523},
{0.000984, 4.866403, 4.859330, 6.240893},
{0.000813, 4.856633, 4.924527, 6.293413},
{0.001112, 4.789260, 5.009880, 6.433540},
{0.000552, 4.760747, 5.090447, 6.599380},
{0.000391, 4.961447, 5.111033, 6.756370},
{0.000332, 4.980953, 5.138127, 6.867420},
{0.000201, 5.181957, 4.740160, 6.460997},
{0.000240, 5.185390, 4.874840, 6.819093},
{0.000130, 5.270350, 4.734213, 6.826240},
{0.000104, 5.371937, 4.595087, 6.659253},
{0.000083, 5.362000, 4.617470, 6.837770},
{0.000069, 5.285997, 4.754993, 7.159043},
{0.000049, 5.488470, 4.396107, 6.727357},
{0.000058, 4.958940, 4.580460, 6.477740},
{0.000028, 5.521253, 4.440493, 7.205017},
// NOTE(review): the next row (index 49) is all zeros, which makes every
// coefficient free at that QP. It looks like missing data rather than a
// real set of weights - confirm and replace if so.
{0.000000, 0.000000, 0.000000, 0.000000},
{0.000019, 5.811260, 4.399110, 7.336310},
};
// Forward declaration so this header does not need encoderstate.h.
typedef struct encoder_state_t encoder_state_t;
// Reads MAX_FAST_COEFF_COST_QP rows of four floats from the stream into the
// table. Returns 0 on success and 1 if a row could not be read.
int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_coeff_table_f);
// Fills the table from default_fast_coeff_cost_wts.
void kvz_fast_coeff_use_default_table(fast_coeff_table_t *fast_coeff_table);
// Returns the packed weights stored for state->qp.
uint64_t kvz_fast_coeff_get_weights(const encoder_state_t *state);
#endif // FAST_COEFF_COST_H_

View file

@ -379,4 +379,8 @@ typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V } color_t;
# define COMPILE_ARM 0
#endif
// Min & max delta QP limits based on bit depth.
// Both expansions are fully parenthesized so that they behave as a single
// value inside larger expressions such as "KVZ_QP_DELTA_MAX / 2".
#define KVZ_QP_DELTA_MIN (-(26 + 3 * (KVZ_BIT_DEPTH - 8)))
#define KVZ_QP_DELTA_MAX (25 + 3 * (KVZ_BIT_DEPTH - 8))
#endif

View file

@ -483,33 +483,46 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic,
ref->stride) >> (KVZ_BIT_DEPTH - 8);
} else {
// Extrapolate pixels from outside the frame.
kvz_extended_block block;
kvz_get_extended_block(pic_x,
pic_y,
ref_x - pic_x,
ref_y - pic_y,
0,
0,
ref->y,
ref->width,
ref->height,
0,
block_width,
block_height,
&block);
// Space for extrapolated pixels and the part from the picture
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[LCU_LUMA_SIZE];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = ref_x,
.blk_y = ref_y,
.blk_w = block_width,
.blk_h = block_height,
.pad_l = 0,
.pad_r = 0,
.pad_t = 0,
.pad_b = 0,
.pad_b_simd = 0,
};
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
unsigned satd = kvz_satd_any_size(block_width,
block_height,
pic_data,
pic->stride,
block.buffer,
block.stride) >> (KVZ_BIT_DEPTH - 8);
if (block.malloc_used) {
FREE_POINTER(block.buffer);
}
block_height,
pic_data,
pic->stride,
ext_origin,
ext_s) >> (KVZ_BIT_DEPTH - 8);
return satd;
}

View file

@ -40,224 +40,258 @@ typedef struct {
} merge_candidates_t;
static void inter_recon_frac_luma(const encoder_state_t * const state,
const kvz_picture * const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
lcu_t *lcu)
static void inter_recon_frac_luma(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
lcu_t *lcu)
{
int mv_frac_x = (mv_param[0] & 3);
int mv_frac_y = (mv_param[1] & 3);
// Fractional luma 1/4-pel
kvz_extended_block src = {0, 0, 0, 0};
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
.blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
.blk_w = block_width,
.blk_h = block_height,
.pad_l = KVZ_LUMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_t = KVZ_LUMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_b_simd = 1 // One row for AVX2
};
// Fractional luma
kvz_get_extended_block(xpos,
ypos,
mv_param[0] >> 2,
mv_param[1] >> 2,
state->tile->offset_x,
state->tile->offset_y,
ref->y,
ref->width,
ref->height,
KVZ_LUMA_FILTER_TAPS,
block_width,
block_height,
&src);
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
kvz_sample_quarterpel_luma(state->encoder_control,
src.orig_topleft,
src.stride,
block_width,
block_height,
lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
LCU_WIDTH,
mv_frac_x,
mv_frac_y,
mv_param);
if (src.malloc_used) free(src.buffer);
ext_origin,
ext_s,
block_width,
block_height,
lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
LCU_WIDTH,
mv_frac_x,
mv_frac_y,
mv_param);
}
static void inter_recon_14bit_frac_luma(const encoder_state_t * const state,
const kvz_picture * const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
hi_prec_buf_t *hi_prec_out)
static void inter_recon_frac_luma_hi(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
hi_prec_buf_t *hi_prec_out)
{
int mv_frac_x = (mv_param[0] & 3);
int mv_frac_y = (mv_param[1] & 3);
// Fractional luma 1/4-pel
kvz_extended_block src = { 0, 0, 0, 0 };
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
.blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
.blk_w = block_width,
.blk_h = block_height,
.pad_l = KVZ_LUMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_t = KVZ_LUMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_b_simd = 1 // One row for AVX2
};
// Fractional luma
kvz_get_extended_block(xpos,
ypos,
mv_param[0] >> 2,
mv_param[1] >> 2,
state->tile->offset_x,
state->tile->offset_y,
ref->y,
ref->width,
ref->height,
KVZ_LUMA_FILTER_TAPS,
block_width,
block_height,
&src);
kvz_sample_14bit_quarterpel_luma(state->encoder_control,
src.orig_topleft,
src.stride,
block_width,
block_height,
hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
LCU_WIDTH,
mv_frac_x,
mv_frac_y,
mv_param);
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
if (src.malloc_used) free(src.buffer);
kvz_get_extended_block(&epol_args);
kvz_sample_quarterpel_luma_hi(state->encoder_control,
ext_origin,
ext_s,
block_width,
block_height,
hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
LCU_WIDTH,
mv_frac_x,
mv_frac_y,
mv_param);
}
static void inter_recon_frac_chroma(const encoder_state_t * const state,
const kvz_picture * const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
lcu_t *lcu)
static void inter_recon_frac_chroma(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
lcu_t *lcu)
{
int mv_frac_x = (mv_param[0] & 7);
int mv_frac_y = (mv_param[1] & 7);
// Translate to chroma
xpos >>= 1;
ypos >>= 1;
block_width >>= 1;
block_height >>= 1;
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
// Fractional chroma 1/8-pel
kvz_extended_block src_u = { 0, 0, 0, 0 };
kvz_extended_block src_v = { 0, 0, 0, 0 };
// Chroma U
// Divisions by 2 due to 4:2:0 chroma subsampling
kvz_epol_args epol_args = {
.src = ref->u,
.src_w = ref->width / 2,
.src_h = ref->height / 2,
.src_s = ref->stride / 2,
.blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
.blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
.blk_w = block_width / 2,
.blk_h = block_height / 2,
.pad_l = KVZ_CHROMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_t = KVZ_CHROMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_b_simd = 3 // Three rows for AVX2
};
//Fractional chroma U
kvz_get_extended_block(xpos, ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->u,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_u);
kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
//Fractional chroma V
kvz_get_extended_block(xpos, ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->v,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_v);
kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
if (src_u.malloc_used) free(src_u.buffer);
if (src_v.malloc_used) free(src_v.buffer);
// Chroma V
epol_args.src = ref->v;
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
}
static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state,
const kvz_picture * const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
hi_prec_buf_t *hi_prec_out)
static void inter_recon_frac_chroma_hi(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
int32_t block_height,
const int16_t mv_param[2],
hi_prec_buf_t *hi_prec_out)
{
int mv_frac_x = (mv_param[0] & 7);
int mv_frac_y = (mv_param[1] & 7);
// Translate to chroma
xpos >>= 1;
ypos >>= 1;
block_width >>= 1;
block_height >>= 1;
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
// Fractional chroma 1/8-pel
kvz_extended_block src_u = { 0, 0, 0, 0 };
kvz_extended_block src_v = { 0, 0, 0, 0 };
// Chroma U
// Divisions by 2 due to 4:2:0 chroma subsampling
kvz_epol_args epol_args = {
.src = ref->u,
.src_w = ref->width / 2,
.src_h = ref->height / 2,
.src_s = ref->stride / 2,
.blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
.blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
.blk_w = block_width / 2,
.blk_h = block_height / 2,
.pad_l = KVZ_CHROMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_t = KVZ_CHROMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_b_simd = 3 // Three rows for AVX2
};
//Fractional chroma U
kvz_get_extended_block(xpos,
ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->u,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_u);
kvz_sample_14bit_octpel_chroma(state->encoder_control,
src_u.orig_topleft,
src_u.stride,
block_width,
block_height,
hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
//Fractional chroma V
kvz_get_extended_block(xpos,
ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->v,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_v);
kvz_sample_14bit_octpel_chroma(state->encoder_control,
src_v.orig_topleft,
src_v.stride,
block_width,
block_height,
hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma_hi(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
if (src_u.malloc_used) free(src_u.buffer);
if (src_v.malloc_used) free(src_v.buffer);
// Chroma V
epol_args.src = ref->v;
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma_hi(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
}
@ -348,7 +382,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
if (fractional_luma) {
// With a fractional MV, do interpolation.
if (state->encoder_control->cfg.bipred && hi_prec_out) {
inter_recon_14bit_frac_luma(state, ref,
inter_recon_frac_luma_hi(state, ref,
pu_in_tile.x, pu_in_tile.y,
width, height,
mv_param, hi_prec_out);
@ -386,7 +420,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
if (fractional_luma || fractional_chroma) {
// With a fractional MV, do interpolation.
if (state->encoder_control->cfg.bipred && hi_prec_out) {
inter_recon_14bit_frac_chroma(state, ref,
inter_recon_frac_chroma_hi(state, ref,
pu_in_tile.x, pu_in_tile.y,
width, height,
mv_param, hi_prec_out);

View file

@ -27,6 +27,7 @@
*/
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
@ -473,6 +474,20 @@ typedef struct kvz_config
enum kvz_file_format file_format;
char *stats_file_prefix;
char *fast_coeff_table_fn; /*!< \brief Pointer to fast coeff table filename */
/** \brief whether we're sampling TBs and their costs for fast cost
* estimation training */
uint8_t rdo_cost_sampling_mode_on;
/** \brief whether we're running in normal mode, sampling TBs and their cost
* for fast estimation training, or comparing estimator accuracy to
* CABAC */
uint8_t fastrd_sampling_on;
uint8_t fastrd_accuracy_check_on;
char *fastrd_learning_outdir_fn;
struct param_set_map *param_set_map;

View file

@ -803,9 +803,10 @@ static double qp_to_lambda(encoder_state_t* const state, int qp)
int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
int aq_offset = round(state->frame->aq_offsets[id]);
state->qp += aq_offset;
// Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
// Maximum delta QP is clipped according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
// Clipping range is a function of bit depth
// Since this value will be later combined with qp_pred, clip to half of that instead to be safe
state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
state->qp = CLIP(state->frame->QP + KVZ_QP_DELTA_MIN / 2, state->frame->QP + KVZ_QP_DELTA_MAX / 2, state->qp);
state->qp = CLIP_TO_QP(state->qp);
state->lambda = qp_to_lambda(state, state->qp);
state->lambda_sqrt = sqrt(state->lambda);
@ -1149,9 +1150,10 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
int id = lcu_pos.x + lcu_pos.y * state->tile->frame->width_in_lcu;
int aq_offset = round(state->frame->aq_offsets[id]);
state->qp += aq_offset;
// Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
// Maximum delta QP is clipped according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
// Clipping range is a function of bit depth
// Since this value will be later combined with qp_pred, clip to half of that instead to be safe
state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
state->qp = CLIP(state->frame->QP + KVZ_QP_DELTA_MIN / 2, state->frame->QP + KVZ_QP_DELTA_MAX / 2, state->qp);
state->qp = CLIP_TO_QP(state->qp);
state->lambda = qp_to_lambda(state, state->qp);
state->lambda_sqrt = sqrt(state->lambda);

140
src/rdo.c
View file

@ -20,8 +20,10 @@
#include "rdo.h"
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include "cabac.h"
#include "context.h"
@ -43,6 +45,11 @@
#define LOG2_SCAN_SET_SIZE 4
#define SBH_THRESHOLD 4
// Highest QP that gets its own fast RD sampling output file. The arrays
// below have one slot per QP from 0 up to and including this value.
#define RD_SAMPLING_MAX_LAST_QP 50
// Output stream per QP; NULL while no file is open for that QP.
static FILE *fastrd_learning_outfile[RD_SAMPLING_MAX_LAST_QP + 1] = {NULL};
// Mutex per QP, taken around writes to the matching output stream.
static pthread_mutex_t outfile_mutex[RD_SAMPLING_MAX_LAST_QP + 1];
const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
const uint32_t g_auiGoRiceParsCoeff[32] =
@ -152,6 +159,67 @@ struct sh_rates_t {
int32_t quant_delta[32 * 32];
};
/**
 * \brief Open the per-QP output files used for fast RD sampling and
 *        accuracy checking.
 *
 * Initializes one mutex and opens one file "<dir_path>/NN.txt" for every QP
 * from 0 to RD_SAMPLING_MAX_LAST_QP.
 *
 * \param dir_path  Directory in which the files are created.
 * \return 0 on success, -1 on failure. If a mutex cannot be created, the
 *         mutexes created so far are destroyed. If a file name does not fit
 *         the buffer or a file cannot be opened, the files opened so far are
 *         closed.
 */
int kvz_init_rdcost_outfiles(const char *dir_path)
{
#define RD_SAMPLING_MAX_FN_LENGTH 4095
  char fn[RD_SAMPLING_MAX_FN_LENGTH + 1];
  int qp;

  // File names use a two-digit QP field.
  assert(RD_SAMPLING_MAX_LAST_QP <= 99);

  for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
    if (pthread_mutex_init(outfile_mutex + qp, NULL) != 0) {
      fprintf(stderr, "Failed to create mutex\n");
      qp--;
      goto out_destroy_mutexes;
    }
  }

  for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
    FILE *curr;
    // The directory is passed as an argument, never as the format string, so
    // a '%' in the path cannot be interpreted as a conversion.
    int len = snprintf(fn, sizeof(fn), "%s/%02i.txt", dir_path, qp);

    if (len < 0 || (size_t)len >= sizeof(fn)) {
      fprintf(stderr, "Fast RD output path is too long: %s\n", dir_path);
      qp--;
      goto out_close_files;
    }

    // Binary mode: sampling writes raw records with fwrite, which a text-mode
    // stream may alter on platforms that translate line endings.
    curr = fopen(fn, "wb");
    if (curr == NULL) {
      fprintf(stderr, "Failed to open %s: %s\n", fn, strerror(errno));
      qp--;
      goto out_close_files;
    }
    fastrd_learning_outfile[qp] = curr;
  }

  return 0;

out_close_files:
  // qp is the index of the last file that was opened successfully.
  for (; qp >= 0; qp--) {
    fclose(fastrd_learning_outfile[qp]);
    fastrd_learning_outfile[qp] = NULL;
  }
  return -1;

out_destroy_mutexes:
  // qp is the index of the last mutex that was initialized successfully.
  for (; qp >= 0; qp--) {
    pthread_mutex_destroy(outfile_mutex + qp);
  }
  return -1;
#undef RD_SAMPLING_MAX_FN_LENGTH
}
/**
* \brief Calculate actual (or really close to actual) bitcost for coding
@ -205,6 +273,33 @@ static INLINE uint32_t get_coeff_cabac_cost(
return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
}
/**
 * \brief Append one sampled block and its CABAC cost to the file of its QP.
 *
 * The record is written as raw bytes: the coefficient count, the cost, and
 * then the coefficients themselves. The QP's mutex is held while writing.
 */
static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
{
  pthread_mutex_t *lock = &outfile_mutex[qp];
  FILE *out;

  assert(sizeof(coeff_t) == sizeof(int16_t));
  assert(qp <= RD_SAMPLING_MAX_LAST_QP);

  pthread_mutex_lock(lock);
  out = fastrd_learning_outfile[qp];
  fwrite(&size, sizeof(size), 1, out);
  fwrite(&ccc, sizeof(ccc), 1, out);
  fwrite(coeff, sizeof(coeff_t), size, out);
  pthread_mutex_unlock(lock);
}
/**
 * \brief Append one "fast_cost ccc" text line to the file of the given QP.
 *
 * The QP's mutex is held while the line is written.
 */
static INLINE void save_accuracy(int qp, uint32_t ccc, uint32_t fast_cost)
{
  pthread_mutex_t *lock = &outfile_mutex[qp];

  assert(qp <= RD_SAMPLING_MAX_LAST_QP);

  pthread_mutex_lock(lock);
  fprintf(fastrd_learning_outfile[qp], "%u %u\n", fast_cost, ccc);
  pthread_mutex_unlock(lock);
}
/**
* \brief Estimate bitcost for coding coefficients.
*
@ -220,14 +315,32 @@ uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
int32_t type,
int8_t scan_mode)
{
if (state->qp >= state->encoder_control->cfg.fast_residual_cost_limit) {
return get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on;
uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on;
if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
state->qp < MAX_FAST_COEFF_COST_QP) {
// TODO: do we need to assert(0) out of the fast-estimation branch if we
// are to save block costs, or should we just warn about it somewhere
// earlier (configuration validation I guess)?
if (save_cccs) {
assert(0 && "Fast RD sampling does not work with fast-residual-cost");
return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
} else {
uint64_t weights = kvz_fast_coeff_get_weights(state);
uint32_t fast_cost = kvz_fast_coeff_cost(coeff, width, weights);
if (check_accuracy) {
uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
save_accuracy(state->qp, ccc, fast_cost);
}
return fast_cost;
}
} else {
// Estimate coeff coding cost based on QP and sum of absolute coeffs.
// const uint32_t sum = kvz_coeff_abs_sum(coeff, width * width);
// return (uint32_t)(sum * (state->qp * COEFF_COST_QP_FACTOR + COEFF_COST_BIAS) + 0.5);
return kvz_fast_coeff_cost(coeff, width, state->qp);
uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
if (save_cccs) {
save_ccc(state->qp, coeff, width * width, ccc);
}
return ccc;
}
}
@ -1192,3 +1305,18 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
}
/**
 * \brief Close the per-QP fast RD output files and destroy their mutexes.
 *
 * Covers every slot from 0 to RD_SAMPLING_MAX_LAST_QP inclusive. Closed
 * streams are reset to NULL so that a later call does not close them again.
 */
void kvz_close_rdcost_outfiles(void)
{
  int i;

  // The arrays hold RD_SAMPLING_MAX_LAST_QP + 1 entries, hence the inclusive
  // bound; an exclusive bound would skip the last file and mutex.
  for (i = 0; i <= RD_SAMPLING_MAX_LAST_QP; i++) {
    if (fastrd_learning_outfile[i] != NULL) {
      fclose(fastrd_learning_outfile[i]);
      fastrd_learning_outfile[i] = NULL;
    }
    // NOTE(review): the mutex is destroyed unconditionally, as before, even
    // when it was never initialized or was already destroyed. Consider
    // tracking whether initialization happened.
    pthread_mutex_destroy(outfile_mutex + i);
  }
}

View file

@ -36,6 +36,9 @@
extern const uint32_t kvz_g_go_rice_range[5];
extern const uint32_t kvz_g_go_rice_prefix_len[5];
// Opens one output file per QP inside the directory dir_path.
// Returns 0 on success and a negative value on failure.
int kvz_init_rdcost_outfiles(const char *dir_path);
// Closes the files opened by kvz_init_rdcost_outfiles.
void kvz_close_rdcost_outfiles(void);
void kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth, uint16_t cbf);

View file

@ -992,12 +992,11 @@ static void search_frac(inter_search_info_t *info)
unsigned costs[4] = { 0 };
kvz_extended_block src = { 0, 0, 0, 0 };
ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH];
ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE];
// Storage buffers for intermediate horizontally filtered results.
// Have the first columns in contiguous memory for vectorization.
ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH];
ALIGNED(64) int16_t intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD];
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1];
const kvz_picture *ref = info->ref;
@ -1013,20 +1012,45 @@ static void search_frac(inter_search_info_t *info)
int8_t sample_off_x = 0;
int8_t sample_off_y = 0;
kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1,
state->tile->offset_x,
state->tile->offset_y,
ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS,
internal_width+1, internal_height+1,
&src);
// Space for (possibly) extrapolated pixels and the part from the picture
// One extra row and column compared to normal interpolation and some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_FME_MAX_INPUT_SIZE_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = state->tile->offset_x + orig.x + mv.x - 1,
.blk_y = state->tile->offset_y + orig.y + mv.y - 1,
.blk_w = internal_width + 1, // TODO: real width
.blk_h = internal_height + 1, // TODO: real height
.pad_l = KVZ_LUMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_t = KVZ_LUMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_b_simd = 0 // AVX2 padding unnecessary because of blk_h
};
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x;
int tmp_stride = pic->stride;
// Search integer position
costs[0] = kvz_satd_any_size(width, height,
tmp_pic, tmp_stride,
src.orig_topleft + src.stride + 1, src.stride);
tmp_pic, tmp_stride,
ext_origin + ext_s + 1, ext_s);
costs[0] += info->mvd_cost_func(state,
mv.x, mv.y, 2,
@ -1056,8 +1080,8 @@ static void search_frac(inter_search_info_t *info)
const int mv_shift = (step < 2) ? 1 : 0;
filter_steps[step](state->encoder_control,
src.orig_topleft,
src.stride,
ext_origin,
ext_s,
internal_width,
internal_height,
filtered,
@ -1131,8 +1155,6 @@ static void search_frac(inter_search_info_t *info)
info->best_mv = mv;
info->best_cost = best_cost;
info->best_bitcost = best_bitcost;
if (src.malloc_used) free(src.buffer);
}
/**

File diff suppressed because it is too large Load diff

View file

@ -40,6 +40,7 @@
#include "strategyselector.h"
#include "tables.h"
#include "transform.h"
#include "fast_coeff_cost.h"
static INLINE int32_t hsum32_8x32i(__m256i src)
{
@ -814,81 +815,63 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
return parts[0] + parts[1] + parts[2] + parts[3];
}
#define TO_Q88(f) ((int16_t)((f) * 256.0f))
static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t qp)
static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights)
{
#define NUM_BUCKETS 5
static const int16_t wt_m[NUM_BUCKETS] = {
TO_Q88(-0.004916),
TO_Q88( 0.010806),
TO_Q88( 0.055562),
TO_Q88( 0.033436),
TO_Q88(-0.007690),
};
static const int16_t wt_c[NUM_BUCKETS] = {
TO_Q88( 0.172024),
TO_Q88( 3.421462),
TO_Q88( 2.879506),
TO_Q88( 5.585471),
TO_Q88( 0.256772),
};
const __m256i zero = _mm256_setzero_si256();
const __m256i threes = _mm256_set1_epi16(3);
const __m256i negate_hibytes = _mm256_set1_epi16(0xff00);
const __m128i wt_extract_los = _mm_cvtsi32_si128(0x06040200);
const __m128i wt_extract_his = _mm_cvtsi32_si128(0x07050301);
const __m256i zero = _mm256_setzero_si256();
const __m256i threes = _mm256_set1_epi16(3);
const __m256i ones = _mm256_srli_epi16(threes, 1);
const __m256i twos = _mm256_slli_epi16(ones, 1);
__m256i lo_sum = _mm256_setzero_si256();
__m256i hi_sum = _mm256_setzero_si256();
__m256i wt[NUM_BUCKETS - 1];
for (int32_t i = 0; i < NUM_BUCKETS - 1; i++)
wt[i] = _mm256_set1_epi16(wt_m[i] * qp + wt_c[i]);
__m128i wts_128 = _mm_loadl_epi64 ((const __m128i *)&weights);
__m128i wts_lo_128 = _mm_shuffle_epi8(wts_128, wt_extract_los);
__m128i wts_hi_128 = _mm_shuffle_epi8(wts_128, wt_extract_his);
uint32_t wid_wt = width * (wt_m[NUM_BUCKETS - 1] * qp + wt_c[NUM_BUCKETS - 1]);
__m256i avx_inc = _mm256_setzero_si256();
__m256i wts_lo = _mm256_broadcastsi128_si256(wts_lo_128);
__m256i wts_hi = _mm256_broadcastsi128_si256(wts_hi_128);
for (int32_t i = 0; i < width * width; i += 16) {
__m256i curr = _mm256_loadu_si256((__m256i *)(coeff + i));
__m256i curr_abs = _mm256_abs_epi16 (curr);
__m256i curr_max3 = _mm256_min_epi16 (curr_abs, threes);
for (int i = 0; i < width * width; i += 32) {
__m256i curr_lo = _mm256_loadu_si256 ((const __m256i *)(coeff + i));
__m256i curr_abs_lo = _mm256_abs_epi16 (curr_lo);
__m256i curr_max3_lo = _mm256_min_epu16 (curr_abs_lo, threes);
__m256i curr_eq_0 = _mm256_cmpeq_epi16(curr_max3, zero);
__m256i curr_eq_1 = _mm256_cmpeq_epi16(curr_max3, ones);
__m256i curr_eq_2 = _mm256_cmpeq_epi16(curr_max3, twos);
__m256i curr_eq_3 = _mm256_cmpeq_epi16(curr_max3, threes);
// 4x4 blocks only have 16 coeffs, so handle them separately
__m256i curr_max3_hi;
if (width >= 8) {
__m256i curr_hi = _mm256_loadu_si256 ((const __m256i *)(coeff + i + 16));
__m256i curr_abs_hi = _mm256_abs_epi16 (curr_hi);
curr_max3_hi = _mm256_min_epu16 (curr_abs_hi, threes);
curr_max3_hi = _mm256_slli_epi16 (curr_max3_hi, 8);
} else {
// Set MSBs for high bytes if they're meaningless, so shuffles will
// return zeros for them
curr_max3_hi = negate_hibytes;
}
__m256i curr_max3 = _mm256_or_si256 (curr_max3_lo, curr_max3_hi);
__m256i curr_wts_lo = _mm256_shuffle_epi8(wts_lo, curr_max3);
__m256i curr_wts_hi = _mm256_shuffle_epi8(wts_hi, curr_max3);
__m256i curr_0_wt = _mm256_and_si256 (curr_eq_0, wt[0]);
__m256i curr_1_wt = _mm256_and_si256 (curr_eq_1, wt[1]);
__m256i curr_2_wt = _mm256_and_si256 (curr_eq_2, wt[2]);
__m256i curr_3_wt = _mm256_and_si256 (curr_eq_3, wt[3]);
__m256i curr_sum_lo = _mm256_sad_epu8 (curr_wts_lo, zero);
__m256i curr_sum_hi = _mm256_sad_epu8 (curr_wts_hi, zero);
// Use madd to horizontally sum 16-bit weights into 32-bit atoms
__m256i wt_0_32b = _mm256_madd_epi16(curr_0_wt, ones);
__m256i wt_1_32b = _mm256_madd_epi16(curr_1_wt, ones);
__m256i wt_2_32b = _mm256_madd_epi16(curr_2_wt, ones);
__m256i wt_3_32b = _mm256_madd_epi16(curr_3_wt, ones);
__m256i wt_01 = _mm256_add_epi32(wt_0_32b, wt_1_32b);
__m256i wt_23 = _mm256_add_epi32(wt_2_32b, wt_3_32b);
__m256i curr_wts = _mm256_add_epi32(wt_01, wt_23);
avx_inc = _mm256_add_epi32(avx_inc, curr_wts);
lo_sum = _mm256_add_epi64 (lo_sum, curr_sum_lo);
hi_sum = _mm256_add_epi64 (hi_sum, curr_sum_hi);
}
__m128i inchi = _mm256_extracti128_si256(avx_inc, 1);
__m128i inclo = _mm256_castsi256_si128 (avx_inc);
hi_sum = _mm256_slli_epi64(hi_sum, 8);
__m256i sum0 = _mm256_add_epi64(lo_sum, hi_sum);
__m128i sum_1 = _mm_add_epi32 (inclo, inchi);
__m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sum_3 = _mm_add_epi32 (sum_1, sum_2);
__m128i sum_4 = _mm_shuffle_epi32(sum_3, _MM_SHUFFLE(2, 3, 0, 1));
__m128i sum = _mm_add_epi32 (sum_3, sum_4);
__m256i sum1 = _mm256_permute4x64_epi64(sum0, _MM_SHUFFLE(1, 0, 3, 2));
__m256i sum2 = _mm256_add_epi64 (sum0, sum1);
__m256i sum3 = _mm256_shuffle_epi32 (sum2, _MM_SHUFFLE(1, 0, 3, 2));
__m256i sum4 = _mm256_add_epi64 (sum2, sum3);
uint32_t sum_u32 = _mm_cvtsi128_si32(sum);
uint32_t sum_total = sum_u32 + wid_wt;
return sum_total >> 8;
#undef NUM_BUCKETS
__m128i sum128 = _mm256_castsi256_si128 (sum4);
return (_mm_cvtsi128_si32(sum128) + (1 << 7)) >> 8;
}
#undef TO_Q88
#endif //COMPILE_INTEL_AVX2 && defined X86_64
int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth)

View file

@ -156,7 +156,7 @@ void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder,
}
}
void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
//TODO: horizontal and vertical only filtering
int32_t x, y;
@ -194,8 +194,8 @@ void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -309,8 +309,8 @@ void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -390,8 +390,8 @@ void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -550,8 +550,8 @@ void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -694,7 +694,7 @@ void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, k
}
}
void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
//TODO: horizontal and vertical only filtering
int32_t x, y;
@ -728,58 +728,54 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco
}
void kvz_get_extended_block_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filter_size, int width, int height, kvz_extended_block *out) {
void kvz_get_extended_block_generic(kvz_epol_args *args) {
int half_filter_size = filter_size >> 1;
int min_y = args->blk_y - args->pad_t;
int max_y = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1;
bool out_of_bounds_y = (min_y < 0) || (max_y >= args->src_h);
out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x);
out->stride = ref_width;
out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
out->malloc_used = 0;
int min_x = args->blk_x - args->pad_l;
int max_x = args->blk_x + args->blk_w + args->pad_r - 1;
bool out_of_bounds_x = (min_x < 0) || (max_x >= args->src_w);
int min_y = ypos - half_filter_size + off_y + mv_y;
int max_y = min_y + height + filter_size;
int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);
if (out_of_bounds_y || out_of_bounds_x) {
int min_x = xpos - half_filter_size + off_x + mv_x;
int max_x = min_x + width + filter_size;
int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);
*args->ext = args->buf;
*args->ext_s = args->pad_l + args->blk_w + args->pad_r;
*args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l;
int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;
// Note that stride equals width here.
int cnt_l = CLIP(0, *args->ext_s, -min_x);
int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1));
int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r);
if (sample_out_of_bounds){
out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size));
if (!out->buffer){
fprintf(stderr, "Memory allocation failed!\n");
assert(0);
// For each row including real padding.
// Don't read "don't care" values (SIMD padding). Zero them out.
int y;
for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) {
int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y);
kvz_pixel *sample_l = args->src + clipped_y * args->src_s;
kvz_pixel *sample_r = args->src + clipped_y * args->src_s + args->src_w - 1;
kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0);
kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s);
kvz_pixel *dst_m = dst_l + cnt_l;
kvz_pixel *dst_r = dst_m + cnt_m;
for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = *sample_l;
for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i);
for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = *sample_r;
}
out->stride = width + filter_size;
out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
out->malloc_used = 1;
int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) {
// calculate y-pixel offset
coord_y = y + off_y + mv_y;
coord_y = CLIP(0, (ref_height)-1, coord_y);
coord_y *= ref_width;
if (!out_of_bounds_x){
memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel));
} else {
for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) {
coord_x = x + off_x + mv_x;
coord_x = CLIP(0, (ref_width)-1, coord_x);
// Store source block data (with extended borders)
out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x];
}
}
for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) {
kvz_pixel *dst = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s);
FILL_ARRAY(dst, 0, *args->ext_s);
}
} else {
*args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l);
*args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x;
*args->ext_s = args->src_s;
}
}
@ -793,8 +789,8 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic);
success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic);
success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic);
success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic);
success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic);
success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic);
success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic);
success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic);
return success;

View file

@ -32,9 +32,9 @@
int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
#endif //STRATEGIES_IPOL_GENERIC_H_

View file

@ -29,6 +29,7 @@
#include "strategies/strategies-quant.h"
#include "strategyselector.h"
#include "transform.h"
#include "fast_coeff_cost.h"
#define QUANT_SHIFT 14
/**
@ -342,46 +343,30 @@ static uint32_t coeff_abs_sum_generic(const coeff_t *coeffs, size_t length)
return sum;
}
static INLINE int16_t to_q88(float f)
static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights)
{
return (int16_t)(f * 256.0f);
weights[0] = (wts_packed >> 0) & 0xffff;
weights[1] = (wts_packed >> 16) & 0xffff;
weights[2] = (wts_packed >> 32) & 0xffff;
weights[3] = (wts_packed >> 48) & 0xffff;
}
static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t qp)
static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights)
{
uint32_t sum = 0;
#define NUM_BUCKETS 5
const int16_t wt_m[NUM_BUCKETS] = {
to_q88(-0.004916),
to_q88(0.010806),
to_q88(0.055562),
to_q88(0.033436),
to_q88(-0.007690),
};
const int16_t wt_c[NUM_BUCKETS] = {
to_q88(0.172024),
to_q88(3.421462),
to_q88(2.879506),
to_q88(5.585471),
to_q88(0.256772),
};
uint16_t weights_unpacked[4];
int16_t wt[NUM_BUCKETS];
for (int32_t i = 0; i < NUM_BUCKETS; i++)
wt[i] = wt_m[i] * qp + wt_c[i];
get_coeff_weights(weights, weights_unpacked);
for (int32_t i = 0; i < width * width; i++) {
int16_t curr = coeff[i];
int16_t signmask = curr >> 15;
int16_t curr_abs = (curr ^ signmask) - signmask;
if (curr_abs > 3)
int16_t curr = coeff[i];
uint32_t curr_abs = abs(curr);
if (curr_abs > 3) {
curr_abs = 3;
sum += wt[curr_abs];
}
sum += weights_unpacked[curr_abs];
}
sum += wt[NUM_BUCKETS - 1] * width;
return sum >> 8;
#undef NUM_BUCKETS
return (sum + (1 << 7)) >> 8;
}
int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth)

View file

@ -33,8 +33,8 @@ ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
epol_func *kvz_get_extended_block;
kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;
int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) {

View file

@ -31,21 +31,63 @@
#include "kvazaar.h"
#include "search_inter.h"
// AVX2 implementation of horizontal filter reads and
// writes two rows for luma and four for chroma at a time.
// Extra vertical padding is added to prevent segfaults.
// Horizontal padding is not needed even if one extra byte
// is read because kvz_image_alloc adds enough padding.
//
// All sizes below are sample counts, not byte counts.
// INPUT sizes: extended block plus the extra padding rows
// (one for luma, three for chroma).
#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA)
#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA)
// IM sizes: the luma value is the per-plane length of the
// hor_intermediate buffers taken by ipol_blocks_func.
// NOTE(review): the chroma value presumably sizes the matching
// chroma intermediate buffer -- confirm against its users.
#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH)
#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C)
// On top of basic interpolation, FME needs one extra
// column and row for ME (left and up). Adding the
// extra row happens to satisfy AVX2 requirements for
// row count. No other extra rows are needed.
#define KVZ_FME_MAX_INPUT_SIZE_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 1))
typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block;
typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
kvz_pixel filtered[4][LCU_LUMA_SIZE], int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t sample_off_x, int8_t sample_off_y);
typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filter_size, int width, int height, kvz_extended_block *out);
// Argument bundle for epol_func (kvz_get_extended_block).
//
// The caller describes a block inside a source picture plus the padding
// it needs around that block. The function then reports, through the
// ext/ext_origin/ext_s output pointers, where the padded block can be read:
//  - If the whole padded area (including the pad_b_simd rows) lies inside
//    the source, the outputs point directly into src and *ext_s == src_s.
//    Nothing is copied and buf is not touched.
//  - Otherwise the padded block is copied into buf, with out-of-picture
//    samples replaced by the nearest edge sample, and the outputs point
//    into buf with *ext_s == pad_l + blk_w + pad_r.
typedef struct {
// Source samples
kvz_pixel *src; // Top-left sample
int src_w; // Width
int src_h; // Height
int src_s; // Stride
// Requested sampling position, base dimensions, and padding
int blk_x; // Block left edge, in samples relative to src
int blk_y; // Block top edge, in samples relative to src
int blk_w; // Width
int blk_h; // Height
int pad_l; // Left
int pad_r; // Right
int pad_t; // Top
int pad_b; // Bottom
int pad_b_simd; // "Don't care" rows in the end. Zeroed out.
// Buffer for possible extrapolation. Free memory provided by the caller.
// Must hold at least (pad_l + blk_w + pad_r) *
// (pad_t + blk_h + pad_b + pad_b_simd) samples.
kvz_pixel *buf;
// Extended block data. These are set by the function.
kvz_pixel **ext; // Top-left sample with padding
kvz_pixel **ext_origin; // Top-left sample without padding
int *ext_s; // Stride
} kvz_epol_args;
typedef void(epol_func)(kvz_epol_args *args);
typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
// Declare function pointers.
extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma;
@ -55,8 +97,8 @@ extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
extern epol_func * kvz_get_extended_block;
extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
extern kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
extern kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;
int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
@ -69,8 +111,8 @@ int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
{"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \
{"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \
{"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \
{"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
{"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \
{"sample_quarterpel_luma_hi", (void**) &kvz_sample_quarterpel_luma_hi}, \
{"sample_octpel_chroma_hi", (void**) &kvz_sample_octpel_chroma_hi}, \
{"get_extended_block", (void**) &kvz_get_extended_block}, \

View file

@ -32,7 +32,6 @@
#include "kvazaar.h"
#include "tables.h"
// Declare function pointers.
typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
int32_t height, int8_t type, int8_t scan_idx, int8_t block_type);
@ -45,7 +44,7 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state,
bool early_skip);
typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
int32_t height, int8_t type, int8_t block_type);
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp);
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);

View file

@ -31,7 +31,6 @@
#include "encoderstate.h"
#include "global.h" // IWYU pragma: keep
extern const uint8_t kvz_g_chroma_scale[58];
extern const int16_t kvz_g_inv_quant_scales[6];

View file

@ -1,3 +1,4 @@
race:kvz_eight_tap_filter_hor_8x1_avx2
# AVX2 interpolation reads some extra pixels
race:kvz_ipol_8tap_hor_px_im_avx2
race:kvz_filter_hpel_blocks_hor_ver_luma_avx2
race:kvz_eight_tap_filter_hor_avx2