mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-27 11:24:05 +00:00
Merge remote-tracking branch 'remotes/kvz_github/master' into Fix-monochrome
# Conflicts: # .gitlab-ci.yml # build/kvazaar_lib/kvazaar_lib.vcxproj.filters # src/cfg.c # src/encoder.h # src/kvazaar.h # src/rdo.c
This commit is contained in:
commit
1aaa95601c
|
@ -33,7 +33,7 @@ test-asan:
|
|||
# variables:
|
||||
# CFLAGS: '-fsanitize=thread'
|
||||
# # Temporarily suppress known errors or false positives.
|
||||
# TSAN_OPTIONS: 'suppressions=/builds/TIE/ultravideo/kvazaar/tests/tsan_suppressions.txt'
|
||||
# TSAN_OPTIONS: 'suppressions=/builds/cs/ultravideo/kvazaar/tests/tsan_suppressions.txt'
|
||||
|
||||
test-ubsan:
|
||||
<<: *test-template
|
||||
|
|
11
README.md
11
README.md
|
@ -117,6 +117,7 @@ Options:
|
|||
bits, lambda, distortion, and qp for each ctu.
|
||||
These are meant for debugging and are not
|
||||
written unless the prefix is defined.
|
||||
|
||||
Video structure:
|
||||
-q, --qp <integer> : Quantization parameter [22]
|
||||
-p, --period <integer> : Period of intra pictures [64]
|
||||
|
@ -253,6 +254,16 @@ Compression tools:
|
|||
- sensitive: Terminate even earlier.
|
||||
--fast-residual-cost <int> : Skip CABAC cost for residual coefficients
|
||||
when QP is below the limit. [0]
|
||||
--fast-coeff-table <string> : Read custom weights for residual
|
||||
coefficients from a file instead of using
|
||||
defaults [default]
|
||||
      --fastrd-sampling      : Enable learning data sampling for fast coefficient
|
||||
table generation
|
||||
--fastrd-accuracy-check : Evaluate the accuracy of fast coefficient
|
||||
prediction
|
||||
--fastrd-outdir : Directory to which to output sampled data or accuracy
|
||||
data, into <fastrd-outdir>/0.txt to 50.txt, one file
|
||||
for each QP that blocks were estimated on
|
||||
--(no-)intra-rdo-et : Check intra modes in rdo stage only until
|
||||
a zero coefficient CU is found. [disabled]
|
||||
--(no-)early-skip : Try to find skip cu from merge candidates.
|
||||
|
|
18
appveyor.yml
18
appveyor.yml
|
@ -1,8 +1,3 @@
|
|||
# Only the whitelisted branches get built, regardless of build config
|
||||
branches:
|
||||
only:
|
||||
- master
|
||||
|
||||
# Email the author if their commit either failed to build or fixed a failed build
|
||||
# good -> bad, bad -> bad, bad -> good but not good -> good
|
||||
notifications:
|
||||
|
@ -37,13 +32,16 @@ configuration:
|
|||
- Release
|
||||
|
||||
# Build with multiple compilers / build suites
|
||||
image: Visual Studio 2015
|
||||
environment:
|
||||
matrix:
|
||||
- platform: Win32
|
||||
- platform: x64
|
||||
- MSYSTEM: MINGW32
|
||||
- MSYSTEM: MINGW64
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
platform: Win32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
platform: x64
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
|
||||
MSYSTEM: MINGW32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
|
||||
MSYSTEM: MINGW64
|
||||
|
||||
for:
|
||||
-
|
||||
|
|
|
@ -164,6 +164,7 @@
|
|||
<ClCompile Include="..\..\src\nal.c" />
|
||||
<ClCompile Include="..\..\src\rate_control.c" />
|
||||
<ClCompile Include="..\..\src\rdo.c" />
|
||||
<ClCompile Include="..\..\src\fast_coeff_cost.c" />
|
||||
<ClCompile Include="..\..\src\sao.c" />
|
||||
<ClCompile Include="..\..\src\scalinglist.c" />
|
||||
<ClCompile Include="..\..\src\search.c" />
|
||||
|
@ -290,6 +291,7 @@
|
|||
<ClInclude Include="..\..\src\nal.h" />
|
||||
<ClInclude Include="..\..\src\rate_control.h" />
|
||||
<ClInclude Include="..\..\src\rdo.h" />
|
||||
<ClInclude Include="..\..\src\fast_coeff_cost.h" />
|
||||
<ClInclude Include="..\..\src\sao.h" />
|
||||
<ClInclude Include="..\..\src\scalinglist.h" />
|
||||
<ClInclude Include="..\..\src\search.h" />
|
||||
|
|
|
@ -174,6 +174,12 @@
|
|||
<ClCompile Include="..\..\src\rdo.c">
|
||||
<Filter>Compression</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\src\fast_coeff_cost.c">
|
||||
<Filter>Compression</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\src\inter.c">
|
||||
<Filter>Reconstruction</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\src\intra.c">
|
||||
<Filter>Reconstruction</Filter>
|
||||
</ClCompile>
|
||||
|
@ -342,6 +348,9 @@
|
|||
<ClInclude Include="..\..\src\rdo.h">
|
||||
<Filter>Compression</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\src\fast_coeff_cost.h">
|
||||
<Filter>Compression</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\src\strategies\strategies-common.h">
|
||||
<Filter>Optimization\strategies</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
|
|||
#
|
||||
# Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
|
||||
ver_major=6
|
||||
ver_minor=3
|
||||
ver_minor=5
|
||||
ver_release=0
|
||||
|
||||
# Prevents configure from adding a lot of defines to the CFLAGS
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH KVAZAAR "1" "September 2020" "kvazaar v2.0.0" "User Commands"
|
||||
.TH KVAZAAR "1" "January 2021" "kvazaar v2.0.0" "User Commands"
|
||||
.SH NAME
|
||||
kvazaar \- open source HEVC encoder
|
||||
.SH SYNOPSIS
|
||||
|
@ -106,6 +106,7 @@ A prefix used for stats files that include
|
|||
bits, lambda, distortion, and qp for each ctu.
|
||||
These are meant for debugging and are not
|
||||
written unless the prefix is defined.
|
||||
|
||||
.SS "Video structure:"
|
||||
.TP
|
||||
\fB\-q\fR, \fB\-\-qp <integer>
|
||||
|
@ -326,6 +327,24 @@ Motion estimation termination [on]
|
|||
Skip CABAC cost for residual coefficients
|
||||
when QP is below the limit. [0]
|
||||
.TP
|
||||
\fB\-\-fast\-coeff\-table <string>
|
||||
Read custom weights for residual
|
||||
coefficients from a file instead of using
|
||||
defaults [default]
|
||||
.TP
|
||||
\fB\-\-fastrd\-sampling
|
||||
Enable learning data sampling for fast coefficient
|
||||
table generation
|
||||
.TP
|
||||
\fB\-\-fastrd\-accuracy\-check
|
||||
Evaluate the accuracy of fast coefficient
|
||||
prediction
|
||||
.TP
|
||||
\fB\-\-fastrd\-outdir
|
||||
Directory to which to output sampled data or accuracy
|
||||
data, into <fastrd\-outdir>/0.txt to 50.txt, one file
|
||||
for each QP that blocks were estimated on
|
||||
.TP
|
||||
\fB\-\-(no\-)intra\-rdo\-et
|
||||
Check intra modes in rdo stage only until
|
||||
a zero coefficient CU is found. [disabled]
|
||||
|
|
14
examples/README.md
Normal file
14
examples/README.md
Normal file
|
@ -0,0 +1,14 @@
|
|||
Examples
|
||||
========
|
||||
Examples of external files for use with Kvazaar.
|
||||
|
||||
## Region of interest (roi) files
|
||||
A simple text file can be used with the `--roi` switch to setup regions of interest for encoding.
|
||||
The header row of the file tells how many regions the encoded frames are divided into (columns, rows).
|
||||
The header must be followed by a data row with number entries equal to columns * rows.
|
||||
The data row will tell the encoder which delta QP value will be assigned to each region.
|
||||
The included example file will split frames into four regions, with the top regions having a delta QP of +5 and the bottom regions a delta QP of 0.
|
||||
```
|
||||
2 2
|
||||
5 5 0 0
|
||||
```
|
51
examples/fast_coeff_table.txt
Normal file
51
examples/fast_coeff_table.txt
Normal file
|
@ -0,0 +1,51 @@
|
|||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.046152 4.874163 3.830968 6.617950
|
||||
0.040648 4.920004 3.922710 6.572261
|
||||
0.033854 4.982197 4.021474 6.518219
|
||||
0.027073 5.056451 4.082557 6.471514
|
||||
0.021064 5.125763 4.113825 6.436425
|
||||
0.016605 5.170554 4.119091 6.423091
|
||||
0.012953 5.196849 4.128659 6.422746
|
||||
0.010218 5.194947 4.166336 6.431305
|
||||
0.007970 5.177114 4.217242 6.429468
|
||||
0.006442 5.138598 4.275070 6.396064
|
||||
0.005184 5.093265 4.337876 6.352651
|
||||
0.004134 5.046189 4.413434 6.310742
|
||||
0.003239 5.001028 4.492965 6.264692
|
||||
0.002689 4.959881 4.569652 6.198468
|
||||
0.002280 4.920991 4.642861 6.123074
|
||||
0.001940 4.886799 4.709124 6.049688
|
||||
0.001631 4.858057 4.767754 5.986929
|
||||
0.001409 4.839546 4.813134 5.951025
|
||||
0.001223 4.823649 4.856675 5.933274
|
||||
0.001055 4.806288 4.904500 5.940060
|
||||
0.000899 4.789201 4.950018 5.955955
|
||||
0.000781 4.776673 4.981798 5.982144
|
||||
0.000683 4.766721 5.006732 6.019175
|
||||
0.000603 4.757364 5.030649 6.081959
|
||||
0.000529 4.746016 5.059187 6.158720
|
||||
0.000460 4.729670 5.100437 6.254217
|
||||
0.000397 4.711187 5.150631 6.364452
|
||||
0.000345 4.692304 5.213098 6.506122
|
||||
0.000300 4.674471 5.279962 6.667672
|
||||
0.000264 4.660182 5.342776 6.836979
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
||||
0.000237 4.649543 5.392507 6.977093
|
2
examples/roi.txt
Normal file
2
examples/roi.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
2 2
|
||||
5 5 0 0
|
35
rdcost-weight-tool/README.txt
Normal file
35
rdcost-weight-tool/README.txt
Normal file
|
@ -0,0 +1,35 @@
|
|||
To extract the block costs, build Kvazaar as usual, and edit relevant
|
||||
parameters in the beginning of extract_rdcosts.py and run_filter.py, most
|
||||
importantly the number of cores and the set of video sequences you want to
|
||||
encode to extract costs. Run extract_rdcosts.py, it will use Kvazaar to encode
|
||||
each sequence and extract the costs measured there for the quantized blocks.
|
||||
The costs are stored compressed and sorted by block QP, in the following
|
||||
format:
|
||||
|
||||
Size (B) | Description
|
||||
----------+------------
|
||||
4 | size: Coeff group size, in int16's
|
||||
4 | ccc: Coeff group's coding cost
|
||||
size * 2 | coeffs: Coeff group data
|
||||
|
||||
To analyze the costs by running a linear regression over them, build the two
|
||||
tools using:
|
||||
|
||||
$ gcc filter_rdcosts.c -O2 -o frcosts_matrix
|
||||
$ gcc ols_2ndpart.c -O2 -o ols_2ndpart
|
||||
|
||||
Then run the regression in parallel by running run_filter.py. The reason to do
|
||||
it this way is because the data is stored compressed, so there is no way to
|
||||
mmap it in Matlab/Octave/something; the data sets are absolutely huge (larger
|
||||
than reasonable amounts of RAM in a decent workstation), but this way we can
|
||||
store the data compressed and process it in O(1) memory complexity, so it can
|
||||
be done as widely parallelized as you have CPU cores. The result files each
|
||||
consist of 4 numbers, which represent an approximate linear solution to the
|
||||
corresponding set of costs: the price in bits of a coefficient whose absolute
|
||||
value is a) 0, b) 1, c) 2, d) 3 or higher.
|
||||
|
||||
After that, run rdcost_do_avg.py. It will calculate a per-QP average of the
|
||||
costs over the set of the sequences having been run (ie. for each QP, take the
|
||||
results for that QP for each sequence, and calculate their average). This data
|
||||
is what you can use to fill in the default_fast_coeff_cost_wts table in
|
||||
src/fast_coeff_cost.h.
|
4
rdcost-weight-tool/build.sh
Executable file
4
rdcost-weight-tool/build.sh
Executable file
|
@ -0,0 +1,4 @@
|
|||
#!/bin/sh
|
||||
|
||||
gcc -O2 filter_rdcosts.c -o frcosts_matrix
|
||||
gcc -O2 ols_2ndpart.c -o ols_2ndpart
|
166
rdcost-weight-tool/extract_rdcosts.py
Executable file
166
rdcost-weight-tool/extract_rdcosts.py
Executable file
|
@ -0,0 +1,166 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import glob
|
||||
import gzip
|
||||
import os
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
|
||||
# Output locations for the encoder logs and the sampled cost data, plus the
# glob patterns of the input sequences. `sequences` must stay a tuple (main()
# asserts this) so that several independent sets of sequences can be listed.
logdir = "/tmp/rdcost/logs"
ofdir = "/tmp/rdcost/data"
sequences = ("/opt/test_seqs/custom_seqs/*/*.yuv",)

# Every running encoder gets one FIFO per destination QP, so
# n_kvazaars * len(dest_qps) must stay below the per-process file descriptor
# limit (see `ulimit -a`, commonly 1024).
smt_threads = 8    # Match this to the number of hardware threads of the CPU
n_kvz_threads = 1  # Threads used by each encoder instance
n_kvazaars = smt_threads // n_kvz_threads


# Nothing below this line should normally need changing.
def kvz_srcdir(path):
    """Return `path` resolved inside the `src` directory next to this tool's directory."""
    tool_dir = os.path.dirname(os.path.realpath(__file__))
    repo_root = os.path.dirname(tool_dir)
    return os.path.join(repo_root, "src", path)


# QPs for which a sample file is produced, and QPs the sequences are encoded at.
dest_qps = tuple(range(51))
base_qps = tuple(range(12, 43))

kvzargs = [
    kvz_srcdir("kvazaar"),
    "--threads", str(n_kvz_threads),
    "--preset=ultrafast",
    "--fastrd-sampling",
    "--fast-residual-cost=0",
]
kvzenv = {"LD_LIBRARY_PATH": kvz_srcdir(".libs/")}
|
||||
|
||||
class MultiPipeGZOutManager:
    """Context manager owning one named pipe (FIFO) per destination QP.

    For every QP in `dest_qps` two paths inside `odpath` are prepared: the
    FIFO itself and the gzip file its contents are meant to be compressed
    into. Entering the context (re)creates the FIFOs; leaving it removes them.
    """

    pipe_fn_template = "%02i.txt"
    gzout_fn_template = "%02i.txt.gz"

    def __init__(self, odpath, dest_qps):
        self.odpath = odpath
        self.dest_qps = dest_qps

        # Build both lists in a single pass over dest_qps.
        path_pairs = [
            (os.path.join(odpath, self.pipe_fn_template % qp),
             os.path.join(odpath, self.gzout_fn_template % qp))
            for qp in dest_qps
        ]
        self.pipe_fns = [fifo for fifo, _ in path_pairs]
        self.gzout_fns = [gz for _, gz in path_pairs]

    def __enter__(self):
        os.makedirs(self.odpath, exist_ok=True)
        for fifo_path in self.pipe_fns:
            # A leftover entry from an earlier run would make mkfifo fail.
            try:
                os.unlink(fifo_path)
            except FileNotFoundError:
                pass
            os.mkfifo(fifo_path)
        return self

    def __exit__(self, *_):
        for fifo_path in self.pipe_fns:
            os.unlink(fifo_path)

    def items(self):
        """Yield (fifo path, gzip output path) pairs in `dest_qps` order."""
        yield from zip(self.pipe_fns, self.gzout_fns)
|
||||
|
||||
class MTSafeIterable:
    """Iterator wrapper that serialises `next()` calls with a lock.

    `iterable` must already be an iterator (something `next()` accepts);
    each call to `__next__` advances it while holding the lock.
    """

    def __init__(self, iterable):
        self.iterable = iterable
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        self.lock.acquire()
        try:
            return next(self.iterable)
        finally:
            self.lock.release()
|
||||
|
||||
def combinations(xi, yi):
    """Yield every (x, y) pair, iterating `yi` anew for each item of `xi`.

    `xi` is consumed lazily and only once; `yi` must be re-iterable.
    """
    yield from ((x, y) for x in xi for y in yi)
|
||||
|
||||
def chain(lol):
    """Flatten one level: yield the items of each iterable in `lol`, in order."""
    for inner in lol:
        yield from inner
|
||||
|
||||
# Would've used Popen with gzip, but "gzip [fifo]" with an unconnected fifo
|
||||
# will detect the situation and not block, but just consider it an empty
|
||||
# file. Don't like it when tools outsmart their user..
|
||||
def do_gzip(in_fn, out_fn):
    """Copy everything readable from `in_fn` into the gzip file `out_fn`.

    The input is read in 64 KiB chunks until a read returns no data. A
    progress line is printed each time another 64 MiB has been read, and a
    final line once the input is exhausted.
    """
    chunk_size = 65536
    report_interval = chunk_size * 1024

    total_read = 0
    next_report_at = report_interval
    with open(in_fn, "rb") as src, gzip.open(out_fn, "wb") as dst:
        # iter() with a sentinel stops at the first empty read (end of input).
        for chunk in iter(lambda: src.read(chunk_size), b""):
            total_read += len(chunk)
            if total_read >= next_report_at:
                print("    read     %8i MB from %s" % (total_read / (1024 * 1024), in_fn))
                next_report_at += report_interval
            dst.write(chunk)

    print("    finished %8i MB from %s" % (total_read / (1024 * 1024), in_fn))
|
||||
|
||||
def run_job(job):
    """Encode one (sequence path, QP) job and gzip the data sampled by the encoder.

    A FIFO per destination QP is created in the job's output directory and a
    compressor thread is attached to each one before the encoder is started
    with `--fastrd-outdir` pointing at that directory. The encoder's stderr
    goes to a per-job log file.

    The compressor threads are joined before the FIFOs are removed and before
    the function returns, so that the .gz files are complete and their file
    descriptors released when the next job starts. A compressor only finishes
    once the writing end of its FIFO has been opened and closed, so this
    presumably relies on the encoder opening every destination-QP file
    (TODO confirm against the encoder).
    """
    ifpath, qp = job
    jobname = "%s-qp%i" % (os.path.basename(ifpath), qp)

    hevcpath = os.path.join("/tmp", "%s.hevc" % jobname)
    logpath = os.path.join(logdir, "%s.log" % jobname)
    odpath = os.path.join(ofdir, jobname)

    my_kvzargs = kvzargs + ["-i", ifpath,
                            "--qp", str(qp),
                            "-o", hevcpath,
                            "--fastrd-outdir", odpath]

    with open(logpath, "w") as lf:
        with MultiPipeGZOutManager(odpath, dest_qps) as pipes_and_outputs:
            gzip_threads = []
            for pipe_fn, out_fn in pipes_and_outputs.items():
                gzip_thread = threading.Thread(target=do_gzip, args=(pipe_fn, out_fn))
                gzip_thread.start()
                gzip_threads.append(gzip_thread)

            kvz = subprocess.Popen(my_kvzargs, env=kvzenv, stderr=lf)
            kvz.wait()

            # Let every compressor drain its FIFO before the FIFOs are removed.
            for gzip_thread in gzip_threads:
                gzip_thread.join()
|
||||
|
||||
def threadfunc(joblist):
    """Worker body: run jobs taken from the shared `joblist` until it is exhausted."""
    for pending_job in joblist:
        run_job(pending_job)
|
||||
|
||||
def main():
    """Encode every configured sequence at every base QP using `n_kvazaars` worker threads."""
    assert(isinstance(sequences, tuple))
    for directory in (logdir, ofdir):
        os.makedirs(directory, exist_ok=True)

    # Lazily pair each input file matched by the patterns with each base QP;
    # the workers share the resulting job stream through a locked iterator.
    input_files = chain(map(glob.glob, sequences))
    joblist = MTSafeIterable(combinations(input_files, base_qps))

    workers = []
    for _ in range(n_kvazaars):
        workers.append(threading.Thread(target=threadfunc, args=(joblist,)))

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()


if (__name__ == "__main__"):
    main()
|
134
rdcost-weight-tool/filter_rdcosts.c
Normal file
134
rdcost-weight-tool/filter_rdcosts.c
Normal file
|
@ -0,0 +1,134 @@
|
|||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define BUFSZ (64 * 64 * sizeof(uint16_t))
|
||||
#define NUM_COEFF_BUCKETS (4)
|
||||
#define NUM_OTHER_BUCKETS (0)
|
||||
#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
|
||||
#define MAX_COEFF_BUCKET ((NUM_COEFF_BUCKETS) - 1)
|
||||
|
||||
#define clz(x) __builtin_clz(x)
|
||||
#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
|
||||
|
||||
/**
 * Debug helper: print the size and coding cost of a coefficient group,
 * followed by all of its coefficients on a single line.
 *
 * \param buf   coefficients of the group
 * \param size  number of coefficients in buf
 * \param ccc   coding cost recorded for the group
 */
void print_coeffs(const int16_t *buf, uint32_t size, uint32_t ccc)
{
  uint32_t idx = 0;

  printf("Buf size %u, ccc %u\n", size, ccc);
  while (idx < size) {
    printf("%i ", buf[idx]);
    idx++;
  }
  putchar('\n');
}
|
||||
|
||||
void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs, uint16_t *excess)
|
||||
{
|
||||
*excess = 0;
|
||||
uint32_t i;
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
int16_t curr = buf[i];
|
||||
int16_t is_signed = curr >> 15;
|
||||
*num_signs += (is_signed & 1);
|
||||
|
||||
uint16_t abs = (curr ^ is_signed) - is_signed;
|
||||
if (abs > MAX_COEFF_BUCKET) {
|
||||
*excess += abs - MAX_COEFF_BUCKET;
|
||||
abs = MAX_COEFF_BUCKET;
|
||||
}
|
||||
|
||||
buckets[abs]++;
|
||||
}
|
||||
}
|
||||
|
||||
void print_buckets(const uint64_t *buckets, uint64_t num_signs)
|
||||
{
|
||||
uint32_t i;
|
||||
for (i = 0; i < NUM_COEFF_BUCKETS; i++)
|
||||
printf("%3u: %lu\n", i, buckets[i]);
|
||||
printf("Signs: %lu\n", num_signs);
|
||||
}
|
||||
|
||||
void update_matrix(const uint64_t *buckets, uint64_t *mat)
|
||||
{
|
||||
for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
|
||||
for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
|
||||
int curr_pos = y * NUM_TOTAL_BUCKETS + x;
|
||||
mat[curr_pos] += buckets[x] * buckets[y];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Return nonzero iff u is a power of two.
 *
 * Zero is not a power of two; it is rejected explicitly because the bit
 * trick alone, (u & (u - 1)) == 0, would accept it.
 */
static inline int is_power_of_two(uint32_t u)
{
  return u != 0 && (u & (u - 1)) == 0;
}
|
||||
|
||||
int process_rdcosts(FILE *in, FILE *out)
|
||||
{
|
||||
void *buf = malloc(BUFSZ);
|
||||
uint32_t *u32buf = (uint32_t *)buf;
|
||||
int16_t *i16buf = (int16_t *)buf;
|
||||
int rv = 0;
|
||||
|
||||
float weights[NUM_TOTAL_BUCKETS] = {0.0f};
|
||||
|
||||
uint64_t mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0};
|
||||
|
||||
while (!feof(in)) {
|
||||
uint32_t size, ccc, size_sqrt;
|
||||
uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
|
||||
uint64_t cg_num_signs = 0;
|
||||
uint16_t excess = 0;
|
||||
size_t n_read;
|
||||
|
||||
n_read = fread(buf, sizeof(uint32_t), 2, in);
|
||||
size = u32buf[0];
|
||||
ccc = u32buf[1];
|
||||
|
||||
// Can't rely on feof() alone when reading from a pipe that might only get
|
||||
// closed long after the last data has been poured in
|
||||
if (n_read == 0) {
|
||||
break;
|
||||
}
|
||||
if (feof(in) || n_read < 2) {
|
||||
fprintf(stderr, "Unexpected EOF when reading header, managed still to read %u u32's\n", n_read);
|
||||
rv = 1;
|
||||
goto out;
|
||||
}
|
||||
if (!is_power_of_two(size)) {
|
||||
fprintf(stderr, "Errorneous block size %u\n", size);
|
||||
rv = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
size_sqrt = 1 << (ilog2(size) >> 1);
|
||||
n_read = fread(buf, sizeof(int16_t), size, in);
|
||||
if (n_read != size) {
|
||||
fprintf(stderr, "Unexpected EOF when reading block, managed still to read %u i16's\n", n_read);
|
||||
rv = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
count_coeffs(i16buf, size, cg_buckets, &cg_num_signs, &excess);
|
||||
update_matrix(cg_buckets, mat);
|
||||
}
|
||||
for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
|
||||
for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
|
||||
int curr_pos = y * NUM_TOTAL_BUCKETS + x;
|
||||
printf("%lu ", mat[curr_pos]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
out:
|
||||
free(buf);
|
||||
return rv;
|
||||
}
|
||||
|
||||
/**
 * Entry point: filter the records arriving on stdin and print the
 * accumulated matrix to stdout. Command line arguments are ignored.
 */
int main(int ar, char **av)
{
  (void)ar;
  (void)av;

  const int status = process_rdcosts(stdin, stdout);
  return status;
}
|
3
rdcost-weight-tool/invert_matrix.m
Normal file
3
rdcost-weight-tool/invert_matrix.m
Normal file
|
@ -0,0 +1,3 @@
|
|||
% Read a delimited numeric matrix from standard input and write its inverse
% to standard output, with a single space between the values of a row.
input_matrix = dlmread("/dev/stdin");
dlmwrite("/dev/stdout", inv(input_matrix), " ");
|
132
rdcost-weight-tool/ols_2ndpart.c
Normal file
132
rdcost-weight-tool/ols_2ndpart.c
Normal file
|
@ -0,0 +1,132 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define BUFSZ (64 * 64 * sizeof(uint16_t))
|
||||
#define NUM_COEFF_BUCKETS (4)
|
||||
#define NUM_OTHER_BUCKETS (0)
|
||||
#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
|
||||
#ifdef ERR_SQUARED
|
||||
#define STEPSIZE (0.00000001f * 0.000001f)
|
||||
#else
|
||||
#define STEPSIZE (0.00000001f)
|
||||
#endif
|
||||
|
||||
#define clz(x) __builtin_clz(x)
|
||||
#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
|
||||
#define coord(x,y,w) ((x)+((y)*(w)))
|
||||
|
||||
void update_result(const uint64_t *buckets, uint64_t ccc, const double *mat, double *res)
|
||||
{
|
||||
for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
|
||||
double addend = 0.0;
|
||||
for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
|
||||
addend += mat[coord(x, y, NUM_TOTAL_BUCKETS)] * (double)buckets[x];
|
||||
}
|
||||
addend *= (double)ccc;
|
||||
res[y] += addend;
|
||||
}
|
||||
}
|
||||
|
||||
void read_matrix(const char *fn, double *mat)
|
||||
{
|
||||
FILE *f = fopen(fn, "r");
|
||||
for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
|
||||
for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
|
||||
float curr;
|
||||
fscanf(f, "%f", &curr);
|
||||
mat[x + y * NUM_TOTAL_BUCKETS] = curr;
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs)
|
||||
{
|
||||
uint32_t i;
|
||||
for (i = 0; i < size; i++) {
|
||||
int16_t curr = buf[i];
|
||||
int16_t is_signed = curr >> 15;
|
||||
*num_signs += (is_signed & 1);
|
||||
|
||||
uint16_t abs = (curr ^ is_signed) - is_signed;
|
||||
if (abs >= NUM_COEFF_BUCKETS)
|
||||
abs = NUM_COEFF_BUCKETS - 1;
|
||||
|
||||
buckets[abs]++;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Return nonzero iff u is a power of two.
 *
 * Zero is not a power of two; it is rejected explicitly because the bit
 * trick alone, (u & (u - 1)) == 0, would accept it.
 */
static inline int is_power_of_two(uint32_t u)
{
  return u != 0 && (u & (u - 1)) == 0;
}
|
||||
|
||||
int process_rdcosts(FILE *in, FILE *out, const double *mat)
|
||||
{
|
||||
void *buf = malloc(BUFSZ);
|
||||
uint32_t *u32buf = (uint32_t *)buf;
|
||||
int16_t *i16buf = (int16_t *)buf;
|
||||
int rv = 0;
|
||||
|
||||
double res[NUM_TOTAL_BUCKETS] = {0.0};
|
||||
|
||||
while (!feof(in)) {
|
||||
uint32_t size, ccc, size_sqrt;
|
||||
uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
|
||||
uint64_t cg_num_signs = 0;
|
||||
size_t n_read;
|
||||
|
||||
n_read = fread(buf, sizeof(uint32_t), 2, in);
|
||||
size = u32buf[0];
|
||||
ccc = u32buf[1];
|
||||
|
||||
// Can't rely on feof() alone when reading from a pipe that might only get
|
||||
// closed long after the last data has been poured in
|
||||
if (n_read == 0) {
|
||||
break;
|
||||
}
|
||||
if (feof(in) || n_read < 2) {
|
||||
fprintf(stderr, "Unexpected EOF when reading header, managed still to read %u u32's\n", n_read);
|
||||
rv = 1;
|
||||
goto out;
|
||||
}
|
||||
if (!is_power_of_two(size)) {
|
||||
fprintf(stderr, "Errorneous block size %u\n", size);
|
||||
rv = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
size_sqrt = 1 << (ilog2(size) >> 1);
|
||||
|
||||
n_read = fread(buf, sizeof(int16_t), size, in);
|
||||
if (n_read != size) {
|
||||
fprintf(stderr, "Unexpected EOF when reading block, managed still to read %u i16's\n", n_read);
|
||||
rv = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
count_coeffs(i16buf, size, cg_buckets, &cg_num_signs);
|
||||
update_result(cg_buckets, ccc, mat, res);
|
||||
}
|
||||
|
||||
for (int y = 0; y < NUM_TOTAL_BUCKETS; y++)
|
||||
fprintf(out, "%g\n", (float)(res[y]));
|
||||
|
||||
out:
|
||||
free(buf);
|
||||
return rv;
|
||||
}
|
||||
|
||||
int main(int ar, char **av)
|
||||
{
|
||||
double mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0.0};
|
||||
if (ar != 2) {
|
||||
fprintf(stderr, "gib matrix plz\n");
|
||||
return 1;
|
||||
}
|
||||
read_matrix(av[1], mat);
|
||||
return process_rdcosts(stdin, stdout, mat);
|
||||
}
|
||||
|
38
rdcost-weight-tool/rdcost_do_avg.py
Executable file
38
rdcost-weight-tool/rdcost_do_avg.py
Executable file
|
@ -0,0 +1,38 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import glob
|
||||
import sys
|
||||
|
||||
result_path_template = "/tmp/rdcost/coeff_buckets/*-qp%02i.result"


def main(path_template=result_path_template):
    """Print, for each QP from 0 to 50, the average of the matching result files.

    `path_template` is a glob pattern with one integer placeholder for the
    QP. Every matching file is expected to hold exactly four lines, each a
    number. Files with a different line count and files whose four values
    are all zero are skipped with a warning on stderr, so that stdout
    carries nothing but the result table. A QP without usable files is
    reported as four zeros.
    """
    results = []
    for qp in range(51):
        curr_sums = [0.0] * 4
        curr_count = 0
        for fn in glob.glob(path_template % qp):
            with open(fn) as f:
                contents = f.readlines()

            if len(contents) != 4:
                print("Faulty file contents at %s, skipping" % fn, file=sys.stderr)
                continue

            nums = tuple(map(float, contents))
            if all(n == 0.0 for n in nums):
                print("All-zero file %s, skipping" % fn, file=sys.stderr)
                continue

            curr_count += 1
            curr_sums = [curr_sum + num for curr_sum, num in zip(curr_sums, nums)]

        if curr_count > 0:
            curr_avgs = tuple(curr_sum / curr_count for curr_sum in curr_sums)
        else:
            curr_avgs = (0, 0, 0, 0)

        results.append(curr_avgs)

    row_format = "QP %2i: " + ", ".join("%.6f" for _ in range(4))
    print("\n".join(row_format % ((qp,) + tuple(avgs)) for qp, avgs in enumerate(results)))


if (__name__ == "__main__"):
    main()
|
154
rdcost-weight-tool/run_filter.py
Executable file
154
rdcost-weight-tool/run_filter.py
Executable file
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import glob
|
||||
import gzip
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
|
||||
# Tunables: number of worker threads, the directory holding the sampled data,
# and the directory the per-job .result files are written to.
n_threads = 8
datadirs = "/tmp/rdcost/data/"
resultdir = "/tmp/rdcost/coeff_buckets"

# Command lines of the external programs. The ./-prefixed tools and
# invert_matrix.m are looked up relative to the current working directory.
gzargs = ["gzip", "-d"]
filtargs = ["./frcosts_matrix"]
octargs = ["octave-cli", "invert_matrix.m"]
filt2args = ["./ols_2ndpart"]
|
||||
|
||||
class MultiPipeManager:
    """Context manager owning one named pipe (FIFO) per destination QP.

    The FIFOs live in `odpath`. Entering the context (re)creates them;
    leaving it removes them.
    """

    pipe_fn_template = "%02i.txt"

    def __init__(self, odpath, dest_qps):
        self.odpath = odpath
        self.dest_qps = dest_qps
        self.pipe_fns = [os.path.join(odpath, self.pipe_fn_template % qp)
                         for qp in dest_qps]

    def __enter__(self):
        os.makedirs(self.odpath, exist_ok=True)
        for fifo_path in self.pipe_fns:
            # A leftover entry from an earlier run would make mkfifo fail.
            try:
                os.unlink(fifo_path)
            except FileNotFoundError:
                pass
            os.mkfifo(fifo_path)
        return self

    def __exit__(self, *_):
        for fifo_path in self.pipe_fns:
            os.unlink(fifo_path)

    def items(self):
        """Yield the FIFO paths in `dest_qps` order."""
        yield from self.pipe_fns
|
||||
|
||||
class MTSafeIterable:
    """Iterator wrapper that serialises `next()` calls with a lock.

    `iterable` must already be an iterator (something `next()` accepts);
    each call to `__next__` advances it while holding the lock.
    """

    def __init__(self, iterable):
        self.iterable = iterable
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        self.lock.acquire()
        try:
            return next(self.iterable)
        finally:
            self.lock.release()
|
||||
|
||||
def read_in_blocks(f):
    """Yield successive chunks of at most 64 KiB read from `f`, stopping at the first empty read."""
    chunk_size = 65536
    chunk = f.read(chunk_size)
    while len(chunk) != 0:
        yield chunk
        chunk = f.read(chunk_size)
|
||||
|
||||
def exhaust_gzs(sink_f, gzs):
    """Decompress each gzip file in `gzs`, in order, into `sink_f`.

    `sink_f` is a writable binary file object; it is flushed after every
    input file but left open. One progress line per input file is printed.
    """
    for gz in gzs:
        with gzip.open(gz, "rb") as f:
            print("  Doing %s ..." % gz)
            for block in read_in_blocks(f):
                sink_f.write(block)
            sink_f.flush()
|
||||
|
||||
def run_job(jobname, input_gzs):
    """Run the two-stage regression for one job and write `<jobname>.result`.

    Stage one pipes the decompressed contents of `input_gzs` through the
    matrix filter and on into Octave, which writes the inverted matrix to a
    temporary file. Stage two feeds the same data to the second filter
    together with that matrix file; its output is the result file in
    `resultdir`.

    Raises AssertionError if either filter exits with a nonzero status, and
    re-raises any OSError hit while feeding the first stage.
    """
    resultpath = os.path.join(resultdir, "%s.result" % jobname)
    print("Running job %s" % jobname)

    with tempfile.NamedTemporaryFile() as tf:
        filt = subprocess.Popen(filtargs, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        octa = subprocess.Popen(octargs, stdin=filt.stdout, stdout=tf)
        # Octave holds its own copy of the pipe now; drop ours so that the
        # filter gets SIGPIPE instead of blocking should Octave exit early.
        filt.stdout.close()

        try:
            exhaust_gzs(filt.stdin, input_gzs)
        except OSError as e:
            print("OSError %s" % e, file=sys.stderr)
            raise

        filt.stdin.close()
        filt.wait()
        octa.wait()

        if filt.returncode != 0:
            print("First stage failed: %s" % jobname, file=sys.stderr)
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError("First stage failed: %s" % jobname)

        with open(resultpath, "w") as rf:
            f2a = filt2args + [tf.name]
            f2 = subprocess.Popen(f2a, stdin=subprocess.PIPE, stdout=rf)
            exhaust_gzs(f2.stdin, input_gzs)
            # communicate() closes the child's stdin and waits for it to exit.
            f2.communicate()
            if f2.returncode != 0:
                print("Second stage failed: %s" % jobname, file=sys.stderr)
                raise AssertionError("Second stage failed: %s" % jobname)

    print("Job %s done" % jobname)
|
||||
|
||||
def threadfunc(joblist):
    """Worker body: run (job name, input files) jobs from the shared `joblist` until it is exhausted."""
    for entry in joblist:
        name, input_files = entry
        run_job(name, input_files)
|
||||
|
||||
def scan_datadirs(path):
    """Yield one (job name, gzip file list) pair per sequence and destination QP.

    Sequence names are taken from the subdirectories of `path` that are named
    `<sequence>.yuv-qp<N>`. For each sequence and each QP from 0 to 50, the
    job `<sequence>.yuv-qpNN` collects the `NN.txt.gz` files found in all of
    that sequence's directories; the list is empty when there are none.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    dir_name_re = re.compile(r"^([A-Za-z0-9_]+\.yuv)-qp[0-9]{1,2}$")

    seq_names = set()
    for dirent in os.scandir(path):
        if not dirent.is_dir():
            continue
        match = dir_name_re.search(dirent.name)
        if match is not None:
            seq_names.add(match.group(1))

    for seq_name in seq_names:
        seq_glob = os.path.join(path, seq_name + "-qp*/")

        for qp in range(51):
            job_name = seq_name + "-qp%02i" % qp
            qp_fn = "%02i.txt.gz" % qp
            yield job_name, glob.glob(os.path.join(seq_glob, qp_fn))
|
||||
|
||||
def main():
    """Process every job found under `datadirs` using `n_threads` worker threads."""
    for directory in (datadirs, resultdir):
        os.makedirs(directory, exist_ok=True)

    # The workers share one lazily generated job stream through a locked iterator.
    joblist = MTSafeIterable(iter(scan_datadirs(datadirs)))

    workers = []
    for _ in range(n_threads):
        workers.append(threading.Thread(target=threadfunc, args=(joblist,)))

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()


if (__name__ == "__main__"):
    main()
|
|
@ -73,6 +73,8 @@ libkvazaar_la_SOURCES = \
|
|||
encoder_state-geometry.h \
|
||||
encode_coding_tree.c \
|
||||
encode_coding_tree.h \
|
||||
fast_coeff_cost.c \
|
||||
fast_coeff_cost.h \
|
||||
filter.c \
|
||||
filter.h \
|
||||
global.h \
|
||||
|
|
31
src/cfg.c
31
src/cfg.c
|
@ -81,6 +81,7 @@ int kvz_config_init(kvz_config *cfg)
|
|||
cfg->vui.chroma_loc = 0; /* left center */
|
||||
cfg->aud_enable = 0;
|
||||
cfg->cqmfile = NULL;
|
||||
cfg->fast_coeff_table_fn = NULL;
|
||||
cfg->ref_frames = 1;
|
||||
cfg->gop_len = 4;
|
||||
cfg->gop_lowdelay = true;
|
||||
|
@ -176,6 +177,10 @@ int kvz_config_init(kvz_config *cfg)
|
|||
|
||||
cfg->stats_file_prefix = NULL;
|
||||
|
||||
cfg->fastrd_sampling_on = 0;
|
||||
cfg->fastrd_accuracy_check_on = 0;
|
||||
cfg->fastrd_learning_outdir_fn = NULL;
|
||||
|
||||
int8_t in[] = { 17, 27, 32, 44 };
|
||||
int8_t out[] = { 17, 29, 34, 41 };
|
||||
|
||||
|
@ -196,11 +201,13 @@ int kvz_config_destroy(kvz_config *cfg)
|
|||
{
|
||||
if (cfg) {
|
||||
FREE_POINTER(cfg->cqmfile);
|
||||
FREE_POINTER(cfg->fast_coeff_table_fn);
|
||||
FREE_POINTER(cfg->tiles_width_split);
|
||||
FREE_POINTER(cfg->tiles_height_split);
|
||||
FREE_POINTER(cfg->slice_addresses_in_ts);
|
||||
FREE_POINTER(cfg->roi.dqps);
|
||||
FREE_POINTER(cfg->optional_key);
|
||||
FREE_POINTER(cfg->fastrd_learning_outdir_fn);
|
||||
if (cfg->param_set_map)
|
||||
{
|
||||
FREE_POINTER(cfg->param_set_map);
|
||||
|
@ -904,6 +911,30 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
|
|||
cfg->cqmfile = cqmfile;
|
||||
cfg->scaling_list = KVZ_SCALING_LIST_CUSTOM;
|
||||
}
|
||||
else if OPT("fast-coeff-table") {
|
||||
char* fast_coeff_table_fn = strdup(value);
|
||||
if (!fast_coeff_table_fn) {
|
||||
fprintf(stderr, "Failed to allocate memory for fast coeff table file name.\n");
|
||||
return 0;
|
||||
}
|
||||
FREE_POINTER(cfg->fast_coeff_table_fn);
|
||||
cfg->fast_coeff_table_fn = fast_coeff_table_fn;
|
||||
}
|
||||
else if OPT("fastrd-sampling") {
|
||||
cfg->fastrd_sampling_on = 1;
|
||||
}
|
||||
else if OPT("fastrd-accuracy-check") {
|
||||
cfg->fastrd_accuracy_check_on = 1;
|
||||
}
|
||||
else if OPT("fastrd-outdir") {
|
||||
char *fastrd_learning_outdir_fn = strdup(value);
|
||||
if (!fastrd_learning_outdir_fn) {
|
||||
fprintf(stderr, "Failed to allocate memory for fast RD learning outfile name.\n");
|
||||
return 0;
|
||||
}
|
||||
FREE_POINTER(cfg->fastrd_learning_outdir_fn);
|
||||
cfg->fastrd_learning_outdir_fn = fastrd_learning_outdir_fn;
|
||||
}
|
||||
else if OPT("scaling-list") {
|
||||
int8_t scaling_list = KVZ_SCALING_LIST_OFF;
|
||||
int result = parse_enum(value, scaling_list_names, &scaling_list);
|
||||
|
|
14
src/cli.c
14
src/cli.c
|
@ -155,6 +155,10 @@ static const struct option long_options[] = {
|
|||
{ "no-clip-neighbour", no_argument, NULL, 0 },
|
||||
{ "input-file-format", required_argument, NULL, 0 },
|
||||
{ "stats-file-prefix", required_argument, NULL, 0 },
|
||||
{ "fast-coeff-table", required_argument, NULL, 0 },
|
||||
{ "fastrd-sampling", no_argument, NULL, 0 },
|
||||
{ "fastrd-accuracy-check", no_argument, NULL, 0 },
|
||||
{ "fastrd-outdir", required_argument, NULL, 0 },
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
|
@ -577,6 +581,16 @@ void print_help(void)
|
|||
" - sensitive: Terminate even earlier.\n"
|
||||
" --fast-residual-cost <int> : Skip CABAC cost for residual coefficients\n"
|
||||
" when QP is below the limit. [0]\n"
|
||||
" --fast-coeff-table <string> : Read custom weights for residual\n"
|
||||
" coefficients from a file instead of using\n"
|
||||
" defaults [default]\n"
|
||||
" --fast-rd-sampling : Enable learning data sampling for fast coefficient\n"
|
||||
" table generation\n"
|
||||
" --fastrd-accuracy-check : Evaluate the accuracy of fast coefficient\n"
|
||||
" prediction\n"
|
||||
" --fastrd-outdir : Directory to which to output sampled data or accuracy\n"
|
||||
" data, into <fastrd-outdir>/0.txt to 50.txt, one file\n"
|
||||
" for each QP that blocks were estimated on\n"
|
||||
" --(no-)intra-rdo-et : Check intra modes in rdo stage only until\n"
|
||||
" a zero coefficient CU is found. [disabled]\n"
|
||||
" --(no-)early-skip : Try to find skip cu from merge candidates.\n"
|
||||
|
|
|
@ -279,7 +279,11 @@ done:
|
|||
// Do some cleaning up.
|
||||
args->api->picture_free(frame_in);
|
||||
|
||||
// This thread exit call causes problems with media auto-build suite
|
||||
// The environment compiles with MINGW using a different pthreads lib
|
||||
#if !defined(__MINGW32__) && !defined(__MINGW64__)
|
||||
pthread_exit(NULL);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "cu.h"
|
||||
#include "encoder.h"
|
||||
#include "extras/crypto.h"
|
||||
#include "global.h"
|
||||
#include "imagelist.h"
|
||||
#include "inter.h"
|
||||
#include "intra.h"
|
||||
|
@ -351,8 +352,9 @@ static void encode_transform_coeff(encoder_state_t * const state,
|
|||
if (state->must_code_qp_delta) {
|
||||
const int qp_pred = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp);
|
||||
const int qp_delta = cur_cu->qp - qp_pred;
|
||||
assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding.");
|
||||
assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25]."); // This range applies only to 8-bit encoding
|
||||
// Possible deltaQP range depends on bit depth as stated in HEVC specification.
|
||||
assert(qp_delta >= KVZ_QP_DELTA_MIN && qp_delta <= KVZ_QP_DELTA_MAX && "QP delta not in valid range.");
|
||||
|
||||
const int qp_delta_abs = ABS(qp_delta);
|
||||
cabac_data_t* cabac = &state->cabac;
|
||||
|
||||
|
|
|
@ -28,9 +28,10 @@
|
|||
|
||||
#include "cfg.h"
|
||||
#include "gop.h"
|
||||
#include "rdo.h"
|
||||
#include "strategyselector.h"
|
||||
#include "kvz_math.h"
|
||||
|
||||
#include "fast_coeff_cost.h"
|
||||
|
||||
/**
|
||||
* \brief Strength of QP adjustments when using adaptive QP for 360 video.
|
||||
|
@ -275,6 +276,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
|
|||
encoder->cfg.tiles_width_split = NULL;
|
||||
encoder->cfg.tiles_height_split = NULL;
|
||||
encoder->cfg.slice_addresses_in_ts = NULL;
|
||||
encoder->cfg.fast_coeff_table_fn = NULL;
|
||||
|
||||
if (encoder->cfg.gop_len > 0) {
|
||||
if (encoder->cfg.gop_lowdelay) {
|
||||
|
@ -287,7 +289,8 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
|
|||
}
|
||||
|
||||
if( encoder->cfg.intra_qp_offset_auto ) {
|
||||
encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? -kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1 : 0;
|
||||
// Limit offset to -3 since HM/VTM seems to use it even for 32 frame gop
|
||||
encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? MAX(-(int8_t)kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1, -3) : 0;
|
||||
}
|
||||
|
||||
// Disable GOP and QP offset for all-intra coding
|
||||
|
@ -381,6 +384,31 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
|
|||
encoder->scaling_list.use_default_list = 1;
|
||||
}
|
||||
|
||||
if (cfg->fast_coeff_table_fn) {
|
||||
FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
|
||||
if (fast_coeff_table_f == NULL) {
|
||||
fprintf(stderr, "Could not open fast coeff table file.\n");
|
||||
goto init_failed;
|
||||
}
|
||||
if (kvz_fast_coeff_table_parse(&encoder->fast_coeff_table, fast_coeff_table_f) != 0) {
|
||||
fprintf(stderr, "Failed to parse fast coeff table, using default\n");
|
||||
kvz_fast_coeff_use_default_table(&encoder->fast_coeff_table);
|
||||
}
|
||||
fclose(fast_coeff_table_f);
|
||||
} else {
|
||||
kvz_fast_coeff_use_default_table(&encoder->fast_coeff_table);
|
||||
}
|
||||
|
||||
if (cfg->fastrd_sampling_on || cfg->fastrd_accuracy_check_on) {
|
||||
if (cfg->fastrd_learning_outdir_fn == NULL) {
|
||||
fprintf(stderr, "No output file defined for Fast RD sampling or accuracy check.\n");
|
||||
goto init_failed;
|
||||
}
|
||||
if (kvz_init_rdcost_outfiles(cfg->fastrd_learning_outdir_fn) != 0) {
|
||||
goto init_failed;
|
||||
}
|
||||
}
|
||||
|
||||
kvz_scalinglist_process(&encoder->scaling_list, encoder->bitdepth);
|
||||
|
||||
kvz_encoder_control_input_init(encoder, encoder->cfg.width, encoder->cfg.height);
|
||||
|
@ -742,6 +770,8 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)
|
|||
if (encoder->qp_map[i]) FREE_POINTER(encoder->qp_map[i]);
|
||||
}
|
||||
|
||||
kvz_close_rdcost_outfiles();
|
||||
|
||||
free(encoder);
|
||||
}
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
#include "kvazaar.h"
|
||||
#include "scalinglist.h"
|
||||
#include "threadqueue.h"
|
||||
|
||||
#include "fast_coeff_cost.h"
|
||||
|
||||
/* Encoder control options, the main struct */
|
||||
typedef struct encoder_control_t
|
||||
|
@ -135,6 +135,8 @@ typedef struct encoder_control_t
|
|||
|
||||
int32_t poc_lsb_bits;
|
||||
|
||||
fast_coeff_table_t fast_coeff_table;
|
||||
|
||||
int8_t* qp_map[3];
|
||||
|
||||
} encoder_control_t;
|
||||
|
|
5
src/estimate.m
Normal file
5
src/estimate.m
Normal file
|
@ -0,0 +1,5 @@
|
|||
% Ordinary least squares fit over the sampled data read from standard input.
% Every input row is space separated: columns 1-5 are the regressors and
% column 6 is the observed cost. The fitted coefficients are printed.
samples = dlmread("/dev/stdin", " ");
regressors = samples(:, 1:5);
observed = samples(:, 6);
[beta, sigma, r] = ols(observed, regressors);
disp(beta)
|
56
src/fast_coeff_cost.c
Normal file
56
src/fast_coeff_cost.c
Normal file
|
@ -0,0 +1,56 @@
|
|||
#include "fast_coeff_cost.h"
|
||||
#include "kvazaar.h"
|
||||
#include "encoderstate.h"
|
||||
|
||||
/**
 * \brief Convert a cost weight to unsigned Q8.8 fixed point.
 *
 * Costs are expected to be non-negative. The scaled value is saturated
 * before the float-to-integer cast, because that cast has undefined
 * behaviour when the value does not fit the destination type; negative and
 * NaN inputs therefore yield 0 and overly large inputs yield UINT16_MAX.
 *
 * \param f  weight to convert
 * \return   f * 256 + 0.5 truncated to an integer, clamped to [0, UINT16_MAX]
 */
static uint16_t to_q88(float f)
{
  const float scaled = f * 256.0f + 0.5f;

  // Written as a negated comparison so that NaN also takes this branch
  if (!(scaled > 0.0f)) {
    return 0;
  }
  if (scaled >= (float)UINT16_MAX) {
    return UINT16_MAX;
  }
  return (uint16_t)scaled;
}
|
||||
|
||||
/**
 * \brief Pack four weights into a single 64-bit word of Q8.8 fields.
 *
 * f[0] lands in bits 0-15, f[1] in bits 16-31, f[2] in bits 32-47 and f[3]
 * in bits 48-63.
 */
static uint64_t to_4xq88(const float f[4])
{
  uint64_t packed = 0;

  for (int lane = 0; lane < 4; lane++) {
    packed |= (uint64_t)to_q88(f[lane]) << (16 * lane);
  }
  return packed;
}
|
||||
|
||||
/**
 * \brief Fill a fast coefficient cost table from a text stream.
 *
 * Reads MAX_FAST_COEFF_COST_QP rows of four floating point weights and
 * stores each row in packed Q8.8 form.
 *
 * \param fast_coeff_table    table to fill
 * \param fast_coeff_table_f  open stream the weights are read from
 * \return 0 on success, 1 as soon as a row cannot be read completely; rows
 *         stored before the failure remain in the table
 */
int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_coeff_table_f)
{
  uint64_t *const packed_rows = fast_coeff_table->wts_by_qp;
  int row = 0;

  while (row < MAX_FAST_COEFF_COST_QP) {
    float w[4];
    const int n_read = fscanf(fast_coeff_table_f, "%f %f %f %f\n",
                              &w[0], &w[1], &w[2], &w[3]);

    if (n_read != 4) {
      return 1;
    }
    packed_rows[row] = to_4xq88(w);
    row++;
  }
  return 0;
}
|
||||
|
||||
/**
 * \brief Fill a fast coefficient cost table from the built-in default weights.
 *
 * Packs the first MAX_FAST_COEFF_COST_QP rows of
 * default_fast_coeff_cost_wts into the table.
 *
 * \param fast_coeff_table  table to fill
 */
void kvz_fast_coeff_use_default_table(fast_coeff_table_t *fast_coeff_table)
{
  for (int row = 0; row < MAX_FAST_COEFF_COST_QP; row++) {
    fast_coeff_table->wts_by_qp[row] = to_4xq88(default_fast_coeff_cost_wts[row]);
  }
}
|
||||
|
||||
uint64_t kvz_fast_coeff_get_weights(const encoder_state_t *state)
|
||||
{
|
||||
const fast_coeff_table_t *table = &(state->encoder_control->fast_coeff_table);
|
||||
return table->wts_by_qp[state->qp];
|
||||
}
|
78
src/fast_coeff_cost.h
Normal file
78
src/fast_coeff_cost.h
Normal file
|
@ -0,0 +1,78 @@
|
|||
#ifndef FAST_COEFF_COST_H_
|
||||
#define FAST_COEFF_COST_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include "kvazaar.h"
|
||||
// #include "encoderstate.h"
|
||||
|
||||
// Number of entries in fast_coeff_table_t::wts_by_qp
#define MAX_FAST_COEFF_COST_QP 50
|
||||
|
||||
// Per-encoder table of fast coefficient cost weights: one 64-bit word per
// table entry, each word holding four 16-bit Q8.8 weights as packed by
// to_4xq88() in fast_coeff_cost.c.
typedef struct {
|
||||
uint64_t wts_by_qp[MAX_FAST_COEFF_COST_QP];
|
||||
} fast_coeff_table_t;
|
||||
|
||||
// Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from
|
||||
// 0 to MAX_FAST_COEFF_COST_QP
|
||||
static const float default_fast_coeff_cost_wts[][4] = {
|
||||
// Just extend it by stretching the first actual values..
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
// up to here
|
||||
{0.164240, 4.161530, 3.509033, 6.928047},
|
||||
{0.162844, 4.055940, 3.564467, 6.861493},
|
||||
{0.128729, 4.311973, 3.942837, 6.935403},
|
||||
{0.110956, 4.433190, 3.945753, 6.877697},
|
||||
{0.095026, 4.483547, 4.194173, 6.781540},
|
||||
{0.075046, 4.633703, 4.084193, 6.698600},
|
||||
{0.052426, 4.967223, 4.027210, 6.549197},
|
||||
{0.040219, 5.141820, 3.982650, 6.461557},
|
||||
{0.035090, 5.192493, 3.830950, 6.418477},
|
||||
{0.029845, 5.211647, 3.815457, 6.345440},
|
||||
{0.023522, 5.322213, 3.816537, 6.360677},
|
||||
{0.021305, 5.225923, 3.842700, 6.325787},
|
||||
{0.015878, 5.183090, 3.956003, 6.329680},
|
||||
{0.010430, 5.099230, 4.176803, 6.305400},
|
||||
{0.008433, 5.030257, 4.237587, 6.270133},
|
||||
{0.006500, 4.969247, 4.339397, 6.217827},
|
||||
{0.004929, 4.923500, 4.442413, 6.183523},
|
||||
{0.003715, 4.915583, 4.429090, 6.125320},
|
||||
{0.003089, 4.883907, 4.562790, 6.156447},
|
||||
{0.002466, 4.881063, 4.629883, 6.142643},
|
||||
{0.002169, 4.882493, 4.646313, 6.127663},
|
||||
{0.002546, 4.793337, 4.837413, 6.199270},
|
||||
{0.001314, 4.808853, 4.828337, 6.243437},
|
||||
{0.001154, 4.862603, 4.846883, 6.205523},
|
||||
{0.000984, 4.866403, 4.859330, 6.240893},
|
||||
{0.000813, 4.856633, 4.924527, 6.293413},
|
||||
{0.001112, 4.789260, 5.009880, 6.433540},
|
||||
{0.000552, 4.760747, 5.090447, 6.599380},
|
||||
{0.000391, 4.961447, 5.111033, 6.756370},
|
||||
{0.000332, 4.980953, 5.138127, 6.867420},
|
||||
{0.000201, 5.181957, 4.740160, 6.460997},
|
||||
{0.000240, 5.185390, 4.874840, 6.819093},
|
||||
{0.000130, 5.270350, 4.734213, 6.826240},
|
||||
{0.000104, 5.371937, 4.595087, 6.659253},
|
||||
{0.000083, 5.362000, 4.617470, 6.837770},
|
||||
{0.000069, 5.285997, 4.754993, 7.159043},
|
||||
{0.000049, 5.488470, 4.396107, 6.727357},
|
||||
{0.000058, 4.958940, 4.580460, 6.477740},
|
||||
{0.000028, 5.521253, 4.440493, 7.205017},
|
||||
{0.000000, 0.000000, 0.000000, 0.000000},
|
||||
{0.000019, 5.811260, 4.399110, 7.336310},
|
||||
};
|
||||
|
||||
// Forward declaration; only a pointer to the encoder state is needed here
typedef struct encoder_state_t encoder_state_t;
|
||||
|
||||
// Reads MAX_FAST_COEFF_COST_QP rows of four weights from the stream into the
// table. Returns 0 on success and 1 if a row cannot be read completely.
int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_coeff_table_f);
|
||||
// Fills the table from default_fast_coeff_cost_wts.
void kvz_fast_coeff_use_default_table(fast_coeff_table_t *fast_coeff_table);
|
||||
// Returns the packed weight word stored at index state->qp of the encoder's table.
uint64_t kvz_fast_coeff_get_weights(const encoder_state_t *state);
|
||||
|
||||
#endif // FAST_COEFF_COST_H_
|
|
@ -379,4 +379,8 @@ typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V } color_t;
|
|||
# define COMPILE_ARM 0
|
||||
#endif
|
||||
|
||||
// Min & max delta QP limits based on bit depth.
// Both expansions are fully parenthesized so that each macro behaves as a
// single value inside a larger expression; without the outer parentheses
// KVZ_QP_DELTA_MAX / 2 would divide only the bit depth term.
#define KVZ_QP_DELTA_MIN (-(26 + 3 * (KVZ_BIT_DEPTH - 8)))
#define KVZ_QP_DELTA_MAX (25 + 3 * (KVZ_BIT_DEPTH - 8))
|
||||
|
||||
#endif
|
||||
|
|
59
src/image.c
59
src/image.c
|
@ -483,33 +483,46 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic,
|
|||
ref->stride) >> (KVZ_BIT_DEPTH - 8);
|
||||
} else {
|
||||
// Extrapolate pixels from outside the frame.
|
||||
kvz_extended_block block;
|
||||
kvz_get_extended_block(pic_x,
|
||||
pic_y,
|
||||
ref_x - pic_x,
|
||||
ref_y - pic_y,
|
||||
0,
|
||||
0,
|
||||
ref->y,
|
||||
ref->width,
|
||||
ref->height,
|
||||
0,
|
||||
block_width,
|
||||
block_height,
|
||||
&block);
|
||||
|
||||
// Space for extrapolated pixels and the part from the picture
|
||||
// The extrapolation function will set the pointers and stride.
|
||||
kvz_pixel ext_buffer[LCU_LUMA_SIZE];
|
||||
kvz_pixel *ext = NULL;
|
||||
kvz_pixel *ext_origin = NULL;
|
||||
int ext_s = 0;
|
||||
kvz_epol_args epol_args = {
|
||||
.src = ref->y,
|
||||
.src_w = ref->width,
|
||||
.src_h = ref->height,
|
||||
.src_s = ref->stride,
|
||||
.blk_x = ref_x,
|
||||
.blk_y = ref_y,
|
||||
.blk_w = block_width,
|
||||
.blk_h = block_height,
|
||||
.pad_l = 0,
|
||||
.pad_r = 0,
|
||||
.pad_t = 0,
|
||||
.pad_b = 0,
|
||||
.pad_b_simd = 0,
|
||||
};
|
||||
|
||||
// Initialize separately. Gets rid of warning
|
||||
// about using nonstandard extension.
|
||||
epol_args.buf = ext_buffer;
|
||||
epol_args.ext = &ext;
|
||||
epol_args.ext_origin = &ext_origin;
|
||||
epol_args.ext_s = &ext_s;
|
||||
|
||||
kvz_get_extended_block(&epol_args);
|
||||
|
||||
const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
|
||||
|
||||
unsigned satd = kvz_satd_any_size(block_width,
|
||||
block_height,
|
||||
pic_data,
|
||||
pic->stride,
|
||||
block.buffer,
|
||||
block.stride) >> (KVZ_BIT_DEPTH - 8);
|
||||
|
||||
if (block.malloc_used) {
|
||||
FREE_POINTER(block.buffer);
|
||||
}
|
||||
block_height,
|
||||
pic_data,
|
||||
pic->stride,
|
||||
ext_origin,
|
||||
ext_s) >> (KVZ_BIT_DEPTH - 8);
|
||||
|
||||
return satd;
|
||||
}
|
||||
|
|
406
src/inter.c
406
src/inter.c
|
@ -40,224 +40,258 @@ typedef struct {
|
|||
} merge_candidates_t;
|
||||
|
||||
|
||||
static void inter_recon_frac_luma(const encoder_state_t * const state,
|
||||
const kvz_picture * const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
lcu_t *lcu)
|
||||
static void inter_recon_frac_luma(const encoder_state_t *const state,
|
||||
const kvz_picture *const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
lcu_t *lcu)
|
||||
{
|
||||
int mv_frac_x = (mv_param[0] & 3);
|
||||
int mv_frac_y = (mv_param[1] & 3);
|
||||
|
||||
// Fractional luma 1/4-pel
|
||||
kvz_extended_block src = {0, 0, 0, 0};
|
||||
// Space for extrapolated pixels and the part from the picture.
|
||||
// Some extra for AVX2.
|
||||
// The extrapolation function will set the pointers and stride.
|
||||
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
|
||||
kvz_pixel *ext = NULL;
|
||||
kvz_pixel *ext_origin = NULL;
|
||||
int ext_s = 0;
|
||||
kvz_epol_args epol_args = {
|
||||
.src = ref->y,
|
||||
.src_w = ref->width,
|
||||
.src_h = ref->height,
|
||||
.src_s = ref->stride,
|
||||
.blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
|
||||
.blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
|
||||
.blk_w = block_width,
|
||||
.blk_h = block_height,
|
||||
.pad_l = KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_t = KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_b_simd = 1 // One row for AVX2
|
||||
};
|
||||
|
||||
// Fractional luma
|
||||
kvz_get_extended_block(xpos,
|
||||
ypos,
|
||||
mv_param[0] >> 2,
|
||||
mv_param[1] >> 2,
|
||||
state->tile->offset_x,
|
||||
state->tile->offset_y,
|
||||
ref->y,
|
||||
ref->width,
|
||||
ref->height,
|
||||
KVZ_LUMA_FILTER_TAPS,
|
||||
block_width,
|
||||
block_height,
|
||||
&src);
|
||||
// Initialize separately. Gets rid of warning
|
||||
// about using nonstandard extension.
|
||||
epol_args.buf = ext_buffer;
|
||||
epol_args.ext = &ext;
|
||||
epol_args.ext_origin = &ext_origin;
|
||||
epol_args.ext_s = &ext_s;
|
||||
|
||||
kvz_get_extended_block(&epol_args);
|
||||
kvz_sample_quarterpel_luma(state->encoder_control,
|
||||
src.orig_topleft,
|
||||
src.stride,
|
||||
block_width,
|
||||
block_height,
|
||||
lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
|
||||
LCU_WIDTH,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
|
||||
if (src.malloc_used) free(src.buffer);
|
||||
ext_origin,
|
||||
ext_s,
|
||||
block_width,
|
||||
block_height,
|
||||
lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
|
||||
LCU_WIDTH,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
}
|
||||
|
||||
static void inter_recon_14bit_frac_luma(const encoder_state_t * const state,
|
||||
const kvz_picture * const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
hi_prec_buf_t *hi_prec_out)
|
||||
static void inter_recon_frac_luma_hi(const encoder_state_t *const state,
|
||||
const kvz_picture *const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
hi_prec_buf_t *hi_prec_out)
|
||||
{
|
||||
int mv_frac_x = (mv_param[0] & 3);
|
||||
int mv_frac_y = (mv_param[1] & 3);
|
||||
|
||||
// Fractional luma 1/4-pel
|
||||
kvz_extended_block src = { 0, 0, 0, 0 };
|
||||
// Space for extrapolated pixels and the part from the picture.
|
||||
// Some extra for AVX2.
|
||||
// The extrapolation function will set the pointers and stride.
|
||||
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
|
||||
kvz_pixel *ext = NULL;
|
||||
kvz_pixel *ext_origin = NULL;
|
||||
int ext_s = 0;
|
||||
kvz_epol_args epol_args = {
|
||||
.src = ref->y,
|
||||
.src_w = ref->width,
|
||||
.src_h = ref->height,
|
||||
.src_s = ref->stride,
|
||||
.blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
|
||||
.blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
|
||||
.blk_w = block_width,
|
||||
.blk_h = block_height,
|
||||
.pad_l = KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_t = KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_b_simd = 1 // One row for AVX2
|
||||
};
|
||||
|
||||
// Fractional luma
|
||||
kvz_get_extended_block(xpos,
|
||||
ypos,
|
||||
mv_param[0] >> 2,
|
||||
mv_param[1] >> 2,
|
||||
state->tile->offset_x,
|
||||
state->tile->offset_y,
|
||||
ref->y,
|
||||
ref->width,
|
||||
ref->height,
|
||||
KVZ_LUMA_FILTER_TAPS,
|
||||
block_width,
|
||||
block_height,
|
||||
&src);
|
||||
kvz_sample_14bit_quarterpel_luma(state->encoder_control,
|
||||
src.orig_topleft,
|
||||
src.stride,
|
||||
block_width,
|
||||
block_height,
|
||||
hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
|
||||
LCU_WIDTH,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
// Initialize separately. Gets rid of warning
|
||||
// about using nonstandard extension.
|
||||
epol_args.buf = ext_buffer;
|
||||
epol_args.ext = &ext;
|
||||
epol_args.ext_origin = &ext_origin;
|
||||
epol_args.ext_s = &ext_s;
|
||||
|
||||
if (src.malloc_used) free(src.buffer);
|
||||
kvz_get_extended_block(&epol_args);
|
||||
kvz_sample_quarterpel_luma_hi(state->encoder_control,
|
||||
ext_origin,
|
||||
ext_s,
|
||||
block_width,
|
||||
block_height,
|
||||
hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
|
||||
LCU_WIDTH,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
}
|
||||
|
||||
static void inter_recon_frac_chroma(const encoder_state_t * const state,
|
||||
const kvz_picture * const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
lcu_t *lcu)
|
||||
static void inter_recon_frac_chroma(const encoder_state_t *const state,
|
||||
const kvz_picture *const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
lcu_t *lcu)
|
||||
{
|
||||
int mv_frac_x = (mv_param[0] & 7);
|
||||
int mv_frac_y = (mv_param[1] & 7);
|
||||
|
||||
// Translate to chroma
|
||||
xpos >>= 1;
|
||||
ypos >>= 1;
|
||||
block_width >>= 1;
|
||||
block_height >>= 1;
|
||||
// Space for extrapolated pixels and the part from the picture.
|
||||
// Some extra for AVX2.
|
||||
// The extrapolation function will set the pointers and stride.
|
||||
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
|
||||
kvz_pixel *ext = NULL;
|
||||
kvz_pixel *ext_origin = NULL;
|
||||
int ext_s = 0;
|
||||
|
||||
// Fractional chroma 1/8-pel
|
||||
kvz_extended_block src_u = { 0, 0, 0, 0 };
|
||||
kvz_extended_block src_v = { 0, 0, 0, 0 };
|
||||
// Chroma U
|
||||
// Divisions by 2 due to 4:2:0 chroma subsampling
|
||||
kvz_epol_args epol_args = {
|
||||
.src = ref->u,
|
||||
.src_w = ref->width / 2,
|
||||
.src_h = ref->height / 2,
|
||||
.src_s = ref->stride / 2,
|
||||
.blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
|
||||
.blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
|
||||
.blk_w = block_width / 2,
|
||||
.blk_h = block_height / 2,
|
||||
.pad_l = KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_t = KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_b_simd = 3 // Three rows for AVX2
|
||||
};
|
||||
|
||||
//Fractional chroma U
|
||||
kvz_get_extended_block(xpos, ypos,
|
||||
(mv_param[0] >> 2) >> 1,
|
||||
(mv_param[1] >> 2) >> 1,
|
||||
state->tile->offset_x >> 1,
|
||||
state->tile->offset_y >> 1,
|
||||
ref->u,
|
||||
ref->width >> 1,
|
||||
ref->height >> 1,
|
||||
KVZ_CHROMA_FILTER_TAPS,
|
||||
block_width,
|
||||
block_height,
|
||||
&src_u);
|
||||
kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
|
||||
block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
|
||||
// Initialize separately. Gets rid of warning
|
||||
// about using nonstandard extension.
|
||||
epol_args.buf = ext_buffer;
|
||||
epol_args.ext = &ext;
|
||||
epol_args.ext_origin = &ext_origin;
|
||||
epol_args.ext_s = &ext_s;
|
||||
|
||||
//Fractional chroma V
|
||||
kvz_get_extended_block(xpos, ypos,
|
||||
(mv_param[0] >> 2) >> 1,
|
||||
(mv_param[1] >> 2) >> 1,
|
||||
state->tile->offset_x >> 1,
|
||||
state->tile->offset_y >> 1,
|
||||
ref->v,
|
||||
ref->width >> 1,
|
||||
ref->height >> 1,
|
||||
KVZ_CHROMA_FILTER_TAPS,
|
||||
block_width,
|
||||
block_height,
|
||||
&src_v);
|
||||
kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
|
||||
block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
|
||||
kvz_get_extended_block(&epol_args);
|
||||
kvz_sample_octpel_chroma(state->encoder_control,
|
||||
ext_origin,
|
||||
ext_s,
|
||||
block_width / 2,
|
||||
block_height / 2,
|
||||
lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
|
||||
LCU_WIDTH_C,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
|
||||
if (src_u.malloc_used) free(src_u.buffer);
|
||||
if (src_v.malloc_used) free(src_v.buffer);
|
||||
// Chroma V
|
||||
epol_args.src = ref->v;
|
||||
kvz_get_extended_block(&epol_args);
|
||||
kvz_sample_octpel_chroma(state->encoder_control,
|
||||
ext_origin,
|
||||
ext_s,
|
||||
block_width / 2,
|
||||
block_height / 2,
|
||||
lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
|
||||
LCU_WIDTH_C,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
}
|
||||
|
||||
static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state,
|
||||
const kvz_picture * const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
hi_prec_buf_t *hi_prec_out)
|
||||
static void inter_recon_frac_chroma_hi(const encoder_state_t *const state,
|
||||
const kvz_picture *const ref,
|
||||
int32_t xpos,
|
||||
int32_t ypos,
|
||||
int32_t block_width,
|
||||
int32_t block_height,
|
||||
const int16_t mv_param[2],
|
||||
hi_prec_buf_t *hi_prec_out)
|
||||
{
|
||||
int mv_frac_x = (mv_param[0] & 7);
|
||||
int mv_frac_y = (mv_param[1] & 7);
|
||||
|
||||
// Translate to chroma
|
||||
xpos >>= 1;
|
||||
ypos >>= 1;
|
||||
block_width >>= 1;
|
||||
block_height >>= 1;
|
||||
// Space for extrapolated pixels and the part from the picture.
|
||||
// Some extra for AVX2.
|
||||
// The extrapolation function will set the pointers and stride.
|
||||
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
|
||||
kvz_pixel *ext = NULL;
|
||||
kvz_pixel *ext_origin = NULL;
|
||||
int ext_s = 0;
|
||||
|
||||
// Fractional chroma 1/8-pel
|
||||
kvz_extended_block src_u = { 0, 0, 0, 0 };
|
||||
kvz_extended_block src_v = { 0, 0, 0, 0 };
|
||||
// Chroma U
|
||||
// Divisions by 2 due to 4:2:0 chroma subsampling
|
||||
kvz_epol_args epol_args = {
|
||||
.src = ref->u,
|
||||
.src_w = ref->width / 2,
|
||||
.src_h = ref->height / 2,
|
||||
.src_s = ref->stride / 2,
|
||||
.blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
|
||||
.blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
|
||||
.blk_w = block_width / 2,
|
||||
.blk_h = block_height / 2,
|
||||
.pad_l = KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_t = KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
|
||||
.pad_b_simd = 3 // Three rows for AVX2
|
||||
};
|
||||
|
||||
//Fractional chroma U
|
||||
kvz_get_extended_block(xpos,
|
||||
ypos,
|
||||
(mv_param[0] >> 2) >> 1,
|
||||
(mv_param[1] >> 2) >> 1,
|
||||
state->tile->offset_x >> 1,
|
||||
state->tile->offset_y >> 1,
|
||||
ref->u,
|
||||
ref->width >> 1,
|
||||
ref->height >> 1,
|
||||
KVZ_CHROMA_FILTER_TAPS,
|
||||
block_width,
|
||||
block_height,
|
||||
&src_u);
|
||||
kvz_sample_14bit_octpel_chroma(state->encoder_control,
|
||||
src_u.orig_topleft,
|
||||
src_u.stride,
|
||||
block_width,
|
||||
block_height,
|
||||
hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
|
||||
LCU_WIDTH_C,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
// Initialize separately. Gets rid of warning
|
||||
// about using nonstandard extension.
|
||||
epol_args.buf = ext_buffer;
|
||||
epol_args.ext = &ext;
|
||||
epol_args.ext_origin = &ext_origin;
|
||||
epol_args.ext_s = &ext_s;
|
||||
|
||||
//Fractional chroma V
|
||||
kvz_get_extended_block(xpos,
|
||||
ypos,
|
||||
(mv_param[0] >> 2) >> 1,
|
||||
(mv_param[1] >> 2) >> 1,
|
||||
state->tile->offset_x >> 1,
|
||||
state->tile->offset_y >> 1,
|
||||
ref->v,
|
||||
ref->width >> 1,
|
||||
ref->height >> 1,
|
||||
KVZ_CHROMA_FILTER_TAPS,
|
||||
block_width,
|
||||
block_height,
|
||||
&src_v);
|
||||
kvz_sample_14bit_octpel_chroma(state->encoder_control,
|
||||
src_v.orig_topleft,
|
||||
src_v.stride,
|
||||
block_width,
|
||||
block_height,
|
||||
hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
|
||||
LCU_WIDTH_C,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
kvz_get_extended_block(&epol_args);
|
||||
kvz_sample_octpel_chroma_hi(state->encoder_control,
|
||||
ext_origin,
|
||||
ext_s,
|
||||
block_width / 2,
|
||||
block_height / 2,
|
||||
hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
|
||||
LCU_WIDTH_C,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
|
||||
if (src_u.malloc_used) free(src_u.buffer);
|
||||
if (src_v.malloc_used) free(src_v.buffer);
|
||||
// Chroma V
|
||||
epol_args.src = ref->v;
|
||||
kvz_get_extended_block(&epol_args);
|
||||
kvz_sample_octpel_chroma_hi(state->encoder_control,
|
||||
ext_origin,
|
||||
ext_s,
|
||||
block_width / 2,
|
||||
block_height / 2,
|
||||
hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
|
||||
LCU_WIDTH_C,
|
||||
mv_frac_x,
|
||||
mv_frac_y,
|
||||
mv_param);
|
||||
}
|
||||
|
||||
|
||||
|
@ -348,7 +382,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
|
|||
if (fractional_luma) {
|
||||
// With a fractional MV, do interpolation.
|
||||
if (state->encoder_control->cfg.bipred && hi_prec_out) {
|
||||
inter_recon_14bit_frac_luma(state, ref,
|
||||
inter_recon_frac_luma_hi(state, ref,
|
||||
pu_in_tile.x, pu_in_tile.y,
|
||||
width, height,
|
||||
mv_param, hi_prec_out);
|
||||
|
@ -386,7 +420,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
|
|||
if (fractional_luma || fractional_chroma) {
|
||||
// With a fractional MV, do interpolation.
|
||||
if (state->encoder_control->cfg.bipred && hi_prec_out) {
|
||||
inter_recon_14bit_frac_chroma(state, ref,
|
||||
inter_recon_frac_chroma_hi(state, ref,
|
||||
pu_in_tile.x, pu_in_tile.y,
|
||||
width, height,
|
||||
mv_param, hi_prec_out);
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -473,6 +474,20 @@ typedef struct kvz_config
|
|||
enum kvz_file_format file_format;
|
||||
|
||||
char *stats_file_prefix;
|
||||
char *fast_coeff_table_fn; /*!< \brief Pointer to fast coeff table filename */
|
||||
|
||||
/** \brief whether we're sampling TBs and their costs for fast cost
|
||||
* estimation training */
|
||||
uint8_t rdo_cost_sampling_mode_on;
|
||||
|
||||
/** \brief whether we're running in normal mode, sampling TBs and their cost
|
||||
* for fast estimation training, or comparing estimator accuracy to
|
||||
* CABAC */
|
||||
uint8_t fastrd_sampling_on;
|
||||
uint8_t fastrd_accuracy_check_on;
|
||||
|
||||
char *fastrd_learning_outdir_fn;
|
||||
|
||||
|
||||
struct param_set_map *param_set_map;
|
||||
|
||||
|
|
|
@ -803,9 +803,10 @@ static double qp_to_lambda(encoder_state_t* const state, int qp)
|
|||
int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
|
||||
int aq_offset = round(state->frame->aq_offsets[id]);
|
||||
state->qp += aq_offset;
|
||||
// Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
|
||||
// Maximum delta QP is clipped according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
|
||||
// Clipping range is a function of bit depth
|
||||
// Since this value will be later combined with qp_pred, clip to half of that instead to be safe
|
||||
state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
|
||||
state->qp = CLIP(state->frame->QP + KVZ_QP_DELTA_MIN / 2, state->frame->QP + KVZ_QP_DELTA_MAX / 2, state->qp);
|
||||
state->qp = CLIP_TO_QP(state->qp);
|
||||
state->lambda = qp_to_lambda(state, state->qp);
|
||||
state->lambda_sqrt = sqrt(state->lambda);
|
||||
|
@ -1149,9 +1150,10 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
|
|||
int id = lcu_pos.x + lcu_pos.y * state->tile->frame->width_in_lcu;
|
||||
int aq_offset = round(state->frame->aq_offsets[id]);
|
||||
state->qp += aq_offset;
|
||||
// Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
|
||||
// Maximum delta QP is clipped according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
|
||||
// Clipping range is a function of bit depth
|
||||
// Since this value will be later combined with qp_pred, clip to half of that instead to be safe
|
||||
state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
|
||||
state->qp = CLIP(state->frame->QP + KVZ_QP_DELTA_MIN / 2, state->frame->QP + KVZ_QP_DELTA_MAX / 2, state->qp);
|
||||
state->qp = CLIP_TO_QP(state->qp);
|
||||
state->lambda = qp_to_lambda(state, state->qp);
|
||||
state->lambda_sqrt = sqrt(state->lambda);
|
||||
|
|
140
src/rdo.c
140
src/rdo.c
|
@ -20,8 +20,10 @@
|
|||
|
||||
#include "rdo.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#include "cabac.h"
|
||||
#include "context.h"
|
||||
|
@ -43,6 +45,11 @@
|
|||
#define LOG2_SCAN_SET_SIZE 4
|
||||
#define SBH_THRESHOLD 4
|
||||
|
||||
#define RD_SAMPLING_MAX_LAST_QP 50
|
||||
|
||||
static FILE *fastrd_learning_outfile[RD_SAMPLING_MAX_LAST_QP + 1] = {NULL};
|
||||
static pthread_mutex_t outfile_mutex[RD_SAMPLING_MAX_LAST_QP + 1];
|
||||
|
||||
const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
|
||||
const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
|
||||
const uint32_t g_auiGoRiceParsCoeff[32] =
|
||||
|
@ -152,6 +159,67 @@ struct sh_rates_t {
|
|||
int32_t quant_delta[32 * 32];
|
||||
};
|
||||
|
||||
int kvz_init_rdcost_outfiles(const char *dir_path)
|
||||
{
|
||||
#define RD_SAMPLING_MAX_FN_LENGTH 4095
|
||||
static const char *basename_tmpl = "/%02i.txt";
|
||||
char fn_template[RD_SAMPLING_MAX_FN_LENGTH + 1];
|
||||
char fn[RD_SAMPLING_MAX_FN_LENGTH + 1];
|
||||
int rv = 0, qp;
|
||||
|
||||
// As long as QP is a two-digit number, template and produced string should
|
||||
// be equal in length ("%i" -> "22")
|
||||
assert(RD_SAMPLING_MAX_LAST_QP <= 99);
|
||||
assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
|
||||
|
||||
strncpy(fn_template, dir_path, RD_SAMPLING_MAX_FN_LENGTH);
|
||||
strncat(fn_template, basename_tmpl, RD_SAMPLING_MAX_FN_LENGTH - strlen(dir_path));
|
||||
|
||||
for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
|
||||
pthread_mutex_t *curr = outfile_mutex + qp;
|
||||
|
||||
if (pthread_mutex_init(curr, NULL) != 0) {
|
||||
fprintf(stderr, "Failed to create mutex\n");
|
||||
rv = -1;
|
||||
qp--;
|
||||
goto out_destroy_mutexes;
|
||||
}
|
||||
}
|
||||
|
||||
for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
|
||||
FILE *curr;
|
||||
|
||||
snprintf(fn, RD_SAMPLING_MAX_FN_LENGTH, fn_template, qp);
|
||||
fn[RD_SAMPLING_MAX_FN_LENGTH] = 0;
|
||||
curr = fopen(fn, "w");
|
||||
if (curr == NULL) {
|
||||
fprintf(stderr, "Failed to open %s: %s\n", fn, strerror(errno));
|
||||
rv = -1;
|
||||
qp--;
|
||||
goto out_close_files;
|
||||
}
|
||||
fastrd_learning_outfile[qp] = curr;
|
||||
}
|
||||
goto out;
|
||||
|
||||
out_close_files:
|
||||
for (; qp >= 0; qp--) {
|
||||
fclose(fastrd_learning_outfile[qp]);
|
||||
fastrd_learning_outfile[qp] = NULL;
|
||||
}
|
||||
goto out;
|
||||
|
||||
out_destroy_mutexes:
|
||||
for (; qp >= 0; qp--) {
|
||||
pthread_mutex_destroy(outfile_mutex + qp);
|
||||
}
|
||||
goto out;
|
||||
|
||||
out:
|
||||
return rv;
|
||||
#undef RD_SAMPLING_MAX_FN_LENGTH
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* \brief Calculate actual (or really close to actual) bitcost for coding
|
||||
|
@ -205,6 +273,33 @@ static INLINE uint32_t get_coeff_cabac_cost(
|
|||
return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
|
||||
}
|
||||
|
||||
static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
|
||||
{
|
||||
pthread_mutex_t *mtx = outfile_mutex + qp;
|
||||
|
||||
assert(sizeof(coeff_t) == sizeof(int16_t));
|
||||
assert(qp <= RD_SAMPLING_MAX_LAST_QP);
|
||||
|
||||
pthread_mutex_lock(mtx);
|
||||
|
||||
fwrite(&size, sizeof(size), 1, fastrd_learning_outfile[qp]);
|
||||
fwrite(&ccc, sizeof(ccc), 1, fastrd_learning_outfile[qp]);
|
||||
fwrite( coeff, sizeof(coeff_t), size, fastrd_learning_outfile[qp]);
|
||||
|
||||
pthread_mutex_unlock(mtx);
|
||||
}
|
||||
|
||||
static INLINE void save_accuracy(int qp, uint32_t ccc, uint32_t fast_cost)
|
||||
{
|
||||
pthread_mutex_t *mtx = outfile_mutex + qp;
|
||||
|
||||
assert(qp <= RD_SAMPLING_MAX_LAST_QP);
|
||||
|
||||
pthread_mutex_lock(mtx);
|
||||
fprintf(fastrd_learning_outfile[qp], "%u %u\n", fast_cost, ccc);
|
||||
pthread_mutex_unlock(mtx);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Estimate bitcost for coding coefficients.
|
||||
*
|
||||
|
@ -220,14 +315,32 @@ uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
|
|||
int32_t type,
|
||||
int8_t scan_mode)
|
||||
{
|
||||
if (state->qp >= state->encoder_control->cfg.fast_residual_cost_limit) {
|
||||
return get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
|
||||
uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on;
|
||||
uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on;
|
||||
|
||||
if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
|
||||
state->qp < MAX_FAST_COEFF_COST_QP) {
|
||||
// TODO: do we need to assert(0) out of the fast-estimation branch if we
|
||||
// are to save block costs, or should we just warn about it somewhere
|
||||
// earlier (configuration validation I guess)?
|
||||
if (save_cccs) {
|
||||
assert(0 && "Fast RD sampling does not work with fast-residual-cost");
|
||||
return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
|
||||
} else {
|
||||
uint64_t weights = kvz_fast_coeff_get_weights(state);
|
||||
uint32_t fast_cost = kvz_fast_coeff_cost(coeff, width, weights);
|
||||
if (check_accuracy) {
|
||||
uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
|
||||
save_accuracy(state->qp, ccc, fast_cost);
|
||||
}
|
||||
return fast_cost;
|
||||
}
|
||||
} else {
|
||||
// Estimate coeff coding cost based on QP and sum of absolute coeffs.
|
||||
// const uint32_t sum = kvz_coeff_abs_sum(coeff, width * width);
|
||||
// return (uint32_t)(sum * (state->qp * COEFF_COST_QP_FACTOR + COEFF_COST_BIAS) + 0.5);
|
||||
return kvz_fast_coeff_cost(coeff, width, state->qp);
|
||||
uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
|
||||
if (save_cccs) {
|
||||
save_ccc(state->qp, coeff, width * width, ccc);
|
||||
}
|
||||
return ccc;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1192,3 +1305,18 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
|
|||
return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
|
||||
}
|
||||
|
||||
void kvz_close_rdcost_outfiles(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < RD_SAMPLING_MAX_LAST_QP; i++) {
|
||||
FILE *curr = fastrd_learning_outfile[i];
|
||||
pthread_mutex_t *curr_mtx = outfile_mutex + i;
|
||||
if (curr != NULL) {
|
||||
fclose(curr);
|
||||
}
|
||||
if (curr_mtx != NULL) {
|
||||
pthread_mutex_destroy(curr_mtx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,6 +36,9 @@
|
|||
extern const uint32_t kvz_g_go_rice_range[5];
|
||||
extern const uint32_t kvz_g_go_rice_prefix_len[5];
|
||||
|
||||
int kvz_init_rdcost_outfiles(const char *fn_template);
|
||||
void kvz_close_rdcost_outfiles(void);
|
||||
|
||||
void kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
|
||||
int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth, uint16_t cbf);
|
||||
|
||||
|
|
|
@ -992,12 +992,11 @@ static void search_frac(inter_search_info_t *info)
|
|||
|
||||
unsigned costs[4] = { 0 };
|
||||
|
||||
kvz_extended_block src = { 0, 0, 0, 0 };
|
||||
ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH];
|
||||
ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE];
|
||||
|
||||
// Storage buffers for intermediate horizontally filtered results.
|
||||
// Have the first columns in contiguous memory for vectorization.
|
||||
ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH];
|
||||
ALIGNED(64) int16_t intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD];
|
||||
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1];
|
||||
|
||||
const kvz_picture *ref = info->ref;
|
||||
|
@ -1013,20 +1012,45 @@ static void search_frac(inter_search_info_t *info)
|
|||
int8_t sample_off_x = 0;
|
||||
int8_t sample_off_y = 0;
|
||||
|
||||
kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1,
|
||||
state->tile->offset_x,
|
||||
state->tile->offset_y,
|
||||
ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS,
|
||||
internal_width+1, internal_height+1,
|
||||
&src);
|
||||
// Space for (possibly) extrapolated pixels and the part from the picture
|
||||
// One extra row and column compared to normal interpolation and some extra for AVX2.
|
||||
// The extrapolation function will set the pointers and stride.
|
||||
kvz_pixel ext_buffer[KVZ_FME_MAX_INPUT_SIZE_SIMD];
|
||||
kvz_pixel *ext = NULL;
|
||||
kvz_pixel *ext_origin = NULL;
|
||||
int ext_s = 0;
|
||||
kvz_epol_args epol_args = {
|
||||
.src = ref->y,
|
||||
.src_w = ref->width,
|
||||
.src_h = ref->height,
|
||||
.src_s = ref->stride,
|
||||
.blk_x = state->tile->offset_x + orig.x + mv.x - 1,
|
||||
.blk_y = state->tile->offset_y + orig.y + mv.y - 1,
|
||||
.blk_w = internal_width + 1, // TODO: real width
|
||||
.blk_h = internal_height + 1, // TODO: real height
|
||||
.pad_l = KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_t = KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
|
||||
.pad_b_simd = 0 // AVX2 padding unnecessary because of blk_h
|
||||
};
|
||||
|
||||
// Initialize separately. Gets rid of warning
|
||||
// about using nonstandard extension.
|
||||
epol_args.buf = ext_buffer;
|
||||
epol_args.ext = &ext;
|
||||
epol_args.ext_origin = &ext_origin;
|
||||
epol_args.ext_s = &ext_s;
|
||||
|
||||
kvz_get_extended_block(&epol_args);
|
||||
|
||||
kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x;
|
||||
int tmp_stride = pic->stride;
|
||||
|
||||
// Search integer position
|
||||
costs[0] = kvz_satd_any_size(width, height,
|
||||
tmp_pic, tmp_stride,
|
||||
src.orig_topleft + src.stride + 1, src.stride);
|
||||
tmp_pic, tmp_stride,
|
||||
ext_origin + ext_s + 1, ext_s);
|
||||
|
||||
costs[0] += info->mvd_cost_func(state,
|
||||
mv.x, mv.y, 2,
|
||||
|
@ -1056,8 +1080,8 @@ static void search_frac(inter_search_info_t *info)
|
|||
const int mv_shift = (step < 2) ? 1 : 0;
|
||||
|
||||
filter_steps[step](state->encoder_control,
|
||||
src.orig_topleft,
|
||||
src.stride,
|
||||
ext_origin,
|
||||
ext_s,
|
||||
internal_width,
|
||||
internal_height,
|
||||
filtered,
|
||||
|
@ -1131,8 +1155,6 @@ static void search_frac(inter_search_info_t *info)
|
|||
info->best_mv = mv;
|
||||
info->best_cost = best_cost;
|
||||
info->best_bitcost = best_bitcost;
|
||||
|
||||
if (src.malloc_used) free(src.buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -40,6 +40,7 @@
|
|||
#include "strategyselector.h"
|
||||
#include "tables.h"
|
||||
#include "transform.h"
|
||||
#include "fast_coeff_cost.h"
|
||||
|
||||
static INLINE int32_t hsum32_8x32i(__m256i src)
|
||||
{
|
||||
|
@ -814,81 +815,63 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
|
|||
return parts[0] + parts[1] + parts[2] + parts[3];
|
||||
}
|
||||
|
||||
#define TO_Q88(f) ((int16_t)((f) * 256.0f))
|
||||
|
||||
static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t qp)
|
||||
static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights)
|
||||
{
|
||||
#define NUM_BUCKETS 5
|
||||
static const int16_t wt_m[NUM_BUCKETS] = {
|
||||
TO_Q88(-0.004916),
|
||||
TO_Q88( 0.010806),
|
||||
TO_Q88( 0.055562),
|
||||
TO_Q88( 0.033436),
|
||||
TO_Q88(-0.007690),
|
||||
};
|
||||
static const int16_t wt_c[NUM_BUCKETS] = {
|
||||
TO_Q88( 0.172024),
|
||||
TO_Q88( 3.421462),
|
||||
TO_Q88( 2.879506),
|
||||
TO_Q88( 5.585471),
|
||||
TO_Q88( 0.256772),
|
||||
};
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i threes = _mm256_set1_epi16(3);
|
||||
const __m256i negate_hibytes = _mm256_set1_epi16(0xff00);
|
||||
const __m128i wt_extract_los = _mm_cvtsi32_si128(0x06040200);
|
||||
const __m128i wt_extract_his = _mm_cvtsi32_si128(0x07050301);
|
||||
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i threes = _mm256_set1_epi16(3);
|
||||
const __m256i ones = _mm256_srli_epi16(threes, 1);
|
||||
const __m256i twos = _mm256_slli_epi16(ones, 1);
|
||||
__m256i lo_sum = _mm256_setzero_si256();
|
||||
__m256i hi_sum = _mm256_setzero_si256();
|
||||
|
||||
__m256i wt[NUM_BUCKETS - 1];
|
||||
for (int32_t i = 0; i < NUM_BUCKETS - 1; i++)
|
||||
wt[i] = _mm256_set1_epi16(wt_m[i] * qp + wt_c[i]);
|
||||
__m128i wts_128 = _mm_loadl_epi64 ((const __m128i *)&weights);
|
||||
__m128i wts_lo_128 = _mm_shuffle_epi8(wts_128, wt_extract_los);
|
||||
__m128i wts_hi_128 = _mm_shuffle_epi8(wts_128, wt_extract_his);
|
||||
|
||||
uint32_t wid_wt = width * (wt_m[NUM_BUCKETS - 1] * qp + wt_c[NUM_BUCKETS - 1]);
|
||||
__m256i avx_inc = _mm256_setzero_si256();
|
||||
__m256i wts_lo = _mm256_broadcastsi128_si256(wts_lo_128);
|
||||
__m256i wts_hi = _mm256_broadcastsi128_si256(wts_hi_128);
|
||||
|
||||
for (int32_t i = 0; i < width * width; i += 16) {
|
||||
__m256i curr = _mm256_loadu_si256((__m256i *)(coeff + i));
|
||||
__m256i curr_abs = _mm256_abs_epi16 (curr);
|
||||
__m256i curr_max3 = _mm256_min_epi16 (curr_abs, threes);
|
||||
for (int i = 0; i < width * width; i += 32) {
|
||||
__m256i curr_lo = _mm256_loadu_si256 ((const __m256i *)(coeff + i));
|
||||
__m256i curr_abs_lo = _mm256_abs_epi16 (curr_lo);
|
||||
__m256i curr_max3_lo = _mm256_min_epu16 (curr_abs_lo, threes);
|
||||
|
||||
__m256i curr_eq_0 = _mm256_cmpeq_epi16(curr_max3, zero);
|
||||
__m256i curr_eq_1 = _mm256_cmpeq_epi16(curr_max3, ones);
|
||||
__m256i curr_eq_2 = _mm256_cmpeq_epi16(curr_max3, twos);
|
||||
__m256i curr_eq_3 = _mm256_cmpeq_epi16(curr_max3, threes);
|
||||
// 4x4 blocks only have 16 coeffs, so handle them separately
|
||||
__m256i curr_max3_hi;
|
||||
if (width >= 8) {
|
||||
__m256i curr_hi = _mm256_loadu_si256 ((const __m256i *)(coeff + i + 16));
|
||||
__m256i curr_abs_hi = _mm256_abs_epi16 (curr_hi);
|
||||
curr_max3_hi = _mm256_min_epu16 (curr_abs_hi, threes);
|
||||
curr_max3_hi = _mm256_slli_epi16 (curr_max3_hi, 8);
|
||||
} else {
|
||||
// Set MSBs for high bytes if they're meaningless, so shuffles will
|
||||
// return zeros for them
|
||||
curr_max3_hi = negate_hibytes;
|
||||
}
|
||||
__m256i curr_max3 = _mm256_or_si256 (curr_max3_lo, curr_max3_hi);
|
||||
__m256i curr_wts_lo = _mm256_shuffle_epi8(wts_lo, curr_max3);
|
||||
__m256i curr_wts_hi = _mm256_shuffle_epi8(wts_hi, curr_max3);
|
||||
|
||||
__m256i curr_0_wt = _mm256_and_si256 (curr_eq_0, wt[0]);
|
||||
__m256i curr_1_wt = _mm256_and_si256 (curr_eq_1, wt[1]);
|
||||
__m256i curr_2_wt = _mm256_and_si256 (curr_eq_2, wt[2]);
|
||||
__m256i curr_3_wt = _mm256_and_si256 (curr_eq_3, wt[3]);
|
||||
__m256i curr_sum_lo = _mm256_sad_epu8 (curr_wts_lo, zero);
|
||||
__m256i curr_sum_hi = _mm256_sad_epu8 (curr_wts_hi, zero);
|
||||
|
||||
// Use madd to horizontally sum 16-bit weights into 32-bit atoms
|
||||
__m256i wt_0_32b = _mm256_madd_epi16(curr_0_wt, ones);
|
||||
__m256i wt_1_32b = _mm256_madd_epi16(curr_1_wt, ones);
|
||||
__m256i wt_2_32b = _mm256_madd_epi16(curr_2_wt, ones);
|
||||
__m256i wt_3_32b = _mm256_madd_epi16(curr_3_wt, ones);
|
||||
|
||||
__m256i wt_01 = _mm256_add_epi32(wt_0_32b, wt_1_32b);
|
||||
__m256i wt_23 = _mm256_add_epi32(wt_2_32b, wt_3_32b);
|
||||
__m256i curr_wts = _mm256_add_epi32(wt_01, wt_23);
|
||||
avx_inc = _mm256_add_epi32(avx_inc, curr_wts);
|
||||
lo_sum = _mm256_add_epi64 (lo_sum, curr_sum_lo);
|
||||
hi_sum = _mm256_add_epi64 (hi_sum, curr_sum_hi);
|
||||
}
|
||||
__m128i inchi = _mm256_extracti128_si256(avx_inc, 1);
|
||||
__m128i inclo = _mm256_castsi256_si128 (avx_inc);
|
||||
hi_sum = _mm256_slli_epi64(hi_sum, 8);
|
||||
__m256i sum0 = _mm256_add_epi64(lo_sum, hi_sum);
|
||||
|
||||
__m128i sum_1 = _mm_add_epi32 (inclo, inchi);
|
||||
__m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
__m128i sum_3 = _mm_add_epi32 (sum_1, sum_2);
|
||||
__m128i sum_4 = _mm_shuffle_epi32(sum_3, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
__m128i sum = _mm_add_epi32 (sum_3, sum_4);
|
||||
__m256i sum1 = _mm256_permute4x64_epi64(sum0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
__m256i sum2 = _mm256_add_epi64 (sum0, sum1);
|
||||
__m256i sum3 = _mm256_shuffle_epi32 (sum2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
__m256i sum4 = _mm256_add_epi64 (sum2, sum3);
|
||||
|
||||
uint32_t sum_u32 = _mm_cvtsi128_si32(sum);
|
||||
uint32_t sum_total = sum_u32 + wid_wt;
|
||||
return sum_total >> 8;
|
||||
#undef NUM_BUCKETS
|
||||
__m128i sum128 = _mm256_castsi256_si128 (sum4);
|
||||
return (_mm_cvtsi128_si32(sum128) + (1 << 7)) >> 8;
|
||||
}
|
||||
|
||||
#undef TO_Q88
|
||||
|
||||
#endif //COMPILE_INTEL_AVX2 && defined X86_64
|
||||
|
||||
int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth)
|
||||
|
|
|
@ -156,7 +156,7 @@ void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder,
|
|||
}
|
||||
}
|
||||
|
||||
void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
|
||||
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
|
||||
{
|
||||
//TODO: horizontal and vertical only filtering
|
||||
int32_t x, y;
|
||||
|
@ -194,8 +194,8 @@ void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
|
|||
int16_t src_stride,
|
||||
int width,
|
||||
int height,
|
||||
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
|
||||
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
|
||||
kvz_pixel filtered[4][LCU_LUMA_SIZE],
|
||||
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
|
||||
int8_t fme_level,
|
||||
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
|
||||
int8_t hpel_off_x, int8_t hpel_off_y)
|
||||
|
@ -309,8 +309,8 @@ void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
|
|||
int16_t src_stride,
|
||||
int width,
|
||||
int height,
|
||||
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
|
||||
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
|
||||
kvz_pixel filtered[4][LCU_LUMA_SIZE],
|
||||
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
|
||||
int8_t fme_level,
|
||||
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
|
||||
int8_t hpel_off_x, int8_t hpel_off_y)
|
||||
|
@ -390,8 +390,8 @@ void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
|
|||
int16_t src_stride,
|
||||
int width,
|
||||
int height,
|
||||
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
|
||||
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
|
||||
kvz_pixel filtered[4][LCU_LUMA_SIZE],
|
||||
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
|
||||
int8_t fme_level,
|
||||
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
|
||||
int8_t hpel_off_x, int8_t hpel_off_y)
|
||||
|
@ -550,8 +550,8 @@ void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
|
|||
int16_t src_stride,
|
||||
int width,
|
||||
int height,
|
||||
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
|
||||
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
|
||||
kvz_pixel filtered[4][LCU_LUMA_SIZE],
|
||||
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
|
||||
int8_t fme_level,
|
||||
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
|
||||
int8_t hpel_off_x, int8_t hpel_off_y)
|
||||
|
@ -694,7 +694,7 @@ void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, k
|
|||
}
|
||||
}
|
||||
|
||||
void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
|
||||
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
|
||||
{
|
||||
//TODO: horizontal and vertical only filtering
|
||||
int32_t x, y;
|
||||
|
@ -728,58 +728,54 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco
|
|||
}
|
||||
|
||||
|
||||
void kvz_get_extended_block_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
|
||||
int filter_size, int width, int height, kvz_extended_block *out) {
|
||||
void kvz_get_extended_block_generic(kvz_epol_args *args) {
|
||||
|
||||
int half_filter_size = filter_size >> 1;
|
||||
int min_y = args->blk_y - args->pad_t;
|
||||
int max_y = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1;
|
||||
bool out_of_bounds_y = (min_y < 0) || (max_y >= args->src_h);
|
||||
|
||||
out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x);
|
||||
out->stride = ref_width;
|
||||
out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
|
||||
out->malloc_used = 0;
|
||||
int min_x = args->blk_x - args->pad_l;
|
||||
int max_x = args->blk_x + args->blk_w + args->pad_r - 1;
|
||||
bool out_of_bounds_x = (min_x < 0) || (max_x >= args->src_w);
|
||||
|
||||
int min_y = ypos - half_filter_size + off_y + mv_y;
|
||||
int max_y = min_y + height + filter_size;
|
||||
int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);
|
||||
if (out_of_bounds_y || out_of_bounds_x) {
|
||||
|
||||
int min_x = xpos - half_filter_size + off_x + mv_x;
|
||||
int max_x = min_x + width + filter_size;
|
||||
int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);
|
||||
*args->ext = args->buf;
|
||||
*args->ext_s = args->pad_l + args->blk_w + args->pad_r;
|
||||
*args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l;
|
||||
|
||||
int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;
|
||||
// Note that stride equals width here.
|
||||
int cnt_l = CLIP(0, *args->ext_s, -min_x);
|
||||
int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1));
|
||||
int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r);
|
||||
|
||||
if (sample_out_of_bounds){
|
||||
out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size));
|
||||
if (!out->buffer){
|
||||
fprintf(stderr, "Memory allocation failed!\n");
|
||||
assert(0);
|
||||
// For each row including real padding.
|
||||
// Don't read "don't care" values (SIMD padding). Zero them out.
|
||||
int y;
|
||||
for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) {
|
||||
|
||||
int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y);
|
||||
kvz_pixel *sample_l = args->src + clipped_y * args->src_s;
|
||||
kvz_pixel *sample_r = args->src + clipped_y * args->src_s + args->src_w - 1;
|
||||
kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0);
|
||||
kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s);
|
||||
kvz_pixel *dst_m = dst_l + cnt_l;
|
||||
kvz_pixel *dst_r = dst_m + cnt_m;
|
||||
for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = *sample_l;
|
||||
for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i);
|
||||
for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = *sample_r;
|
||||
}
|
||||
out->stride = width + filter_size;
|
||||
out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
|
||||
out->malloc_used = 1;
|
||||
|
||||
int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
|
||||
|
||||
for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) {
|
||||
|
||||
// calculate y-pixel offset
|
||||
coord_y = y + off_y + mv_y;
|
||||
coord_y = CLIP(0, (ref_height)-1, coord_y);
|
||||
coord_y *= ref_width;
|
||||
|
||||
if (!out_of_bounds_x){
|
||||
memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel));
|
||||
} else {
|
||||
for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) {
|
||||
|
||||
coord_x = x + off_x + mv_x;
|
||||
coord_x = CLIP(0, (ref_width)-1, coord_x);
|
||||
|
||||
// Store source block data (with extended borders)
|
||||
out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x];
|
||||
}
|
||||
}
|
||||
for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) {
|
||||
kvz_pixel *dst = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s);
|
||||
FILL_ARRAY(dst, 0, *args->ext_s);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
*args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l);
|
||||
*args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x;
|
||||
*args->ext_s = args->src_s;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -793,8 +789,8 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
|
|||
success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic);
|
||||
|
||||
return success;
|
||||
|
|
|
@ -32,9 +32,9 @@
|
|||
|
||||
int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
|
||||
void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
|
||||
|
||||
#endif //STRATEGIES_IPOL_GENERIC_H_
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "strategies/strategies-quant.h"
|
||||
#include "strategyselector.h"
|
||||
#include "transform.h"
|
||||
#include "fast_coeff_cost.h"
|
||||
|
||||
#define QUANT_SHIFT 14
|
||||
/**
|
||||
|
@ -342,46 +343,30 @@ static uint32_t coeff_abs_sum_generic(const coeff_t *coeffs, size_t length)
|
|||
return sum;
|
||||
}
|
||||
|
||||
static INLINE int16_t to_q88(float f)
|
||||
static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights)
|
||||
{
|
||||
return (int16_t)(f * 256.0f);
|
||||
weights[0] = (wts_packed >> 0) & 0xffff;
|
||||
weights[1] = (wts_packed >> 16) & 0xffff;
|
||||
weights[2] = (wts_packed >> 32) & 0xffff;
|
||||
weights[3] = (wts_packed >> 48) & 0xffff;
|
||||
}
|
||||
|
||||
static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t qp)
|
||||
static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights)
|
||||
{
|
||||
uint32_t sum = 0;
|
||||
#define NUM_BUCKETS 5
|
||||
const int16_t wt_m[NUM_BUCKETS] = {
|
||||
to_q88(-0.004916),
|
||||
to_q88(0.010806),
|
||||
to_q88(0.055562),
|
||||
to_q88(0.033436),
|
||||
to_q88(-0.007690),
|
||||
};
|
||||
const int16_t wt_c[NUM_BUCKETS] = {
|
||||
to_q88(0.172024),
|
||||
to_q88(3.421462),
|
||||
to_q88(2.879506),
|
||||
to_q88(5.585471),
|
||||
to_q88(0.256772),
|
||||
};
|
||||
uint16_t weights_unpacked[4];
|
||||
|
||||
int16_t wt[NUM_BUCKETS];
|
||||
for (int32_t i = 0; i < NUM_BUCKETS; i++)
|
||||
wt[i] = wt_m[i] * qp + wt_c[i];
|
||||
get_coeff_weights(weights, weights_unpacked);
|
||||
|
||||
for (int32_t i = 0; i < width * width; i++) {
|
||||
int16_t curr = coeff[i];
|
||||
int16_t signmask = curr >> 15;
|
||||
int16_t curr_abs = (curr ^ signmask) - signmask;
|
||||
if (curr_abs > 3)
|
||||
int16_t curr = coeff[i];
|
||||
uint32_t curr_abs = abs(curr);
|
||||
if (curr_abs > 3) {
|
||||
curr_abs = 3;
|
||||
|
||||
sum += wt[curr_abs];
|
||||
}
|
||||
sum += weights_unpacked[curr_abs];
|
||||
}
|
||||
sum += wt[NUM_BUCKETS - 1] * width;
|
||||
return sum >> 8;
|
||||
#undef NUM_BUCKETS
|
||||
return (sum + (1 << 7)) >> 8;
|
||||
}
|
||||
|
||||
int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth)
|
||||
|
|
|
@ -33,8 +33,8 @@ ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
|
|||
epol_func *kvz_get_extended_block;
|
||||
kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
|
||||
kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
|
||||
kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
|
||||
kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
|
||||
kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
|
||||
kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;
|
||||
|
||||
|
||||
int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) {
|
||||
|
|
|
@ -31,21 +31,63 @@
|
|||
#include "kvazaar.h"
|
||||
#include "search_inter.h"
|
||||
|
||||
// AVX2 implementation of horizontal filter reads and
|
||||
// writes two rows for luma and four for chroma at a time.
|
||||
// Extra vertical padding is added to prevent segfaults.
|
||||
// Horizontal padding is not needed even if one extra byte
|
||||
// is read because kvz_image_alloc adds enough padding.
|
||||
#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA)
|
||||
#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA)
|
||||
#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH)
|
||||
#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C)
|
||||
|
||||
// On top of basic interpolation, FME needs one extra
|
||||
// column and row for ME (left and up). Adding the
|
||||
// extra row happens to satisfy AVX2 requirements for
|
||||
// row count. No other extra rows are needed.
|
||||
#define KVZ_FME_MAX_INPUT_SIZE_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 1))
|
||||
|
||||
typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block;
|
||||
|
||||
typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height,
|
||||
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
|
||||
kvz_pixel filtered[4][LCU_LUMA_SIZE], int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
|
||||
int8_t sample_off_x, int8_t sample_off_y);
|
||||
|
||||
typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
|
||||
int filter_size, int width, int height, kvz_extended_block *out);
|
||||
/**
 * Argument bundle handed (by pointer) to an epol_func.
 *
 * Input fields describe the source samples, the requested block with its
 * per-side padding, and a caller-provided scratch buffer. The ext, ext_origin
 * and ext_s pointers are outputs: the function writes its result through them.
 * NOTE(review): units of the sizes/paddings (presumably pixels) are not
 * visible here -- confirm against the implementation.
 */
typedef struct {
|
||||
// Source samples
|
||||
kvz_pixel *src; // Top-left sample
|
||||
int src_w; // Width
|
||||
int src_h; // Height
|
||||
int src_s; // Stride
|
||||
|
||||
// Requested sampling position, base dimensions, and padding
|
||||
int blk_x;
|
||||
int blk_y;
|
||||
int blk_w; // Width
|
||||
int blk_h; // Height
|
||||
int pad_l; // Left
|
||||
int pad_r; // Right
|
||||
int pad_t; // Top
|
||||
int pad_b; // Bottom
|
||||
int pad_b_simd; // "Don't care" rows at the end. Zeroed out.
|
||||
|
||||
// Buffer for possible extrapolation. Free memory provided by the caller.
|
||||
kvz_pixel *buf;
|
||||
|
||||
// Extended block data. These are set by the function.
|
||||
kvz_pixel **ext; // Top-left sample with padding
|
||||
kvz_pixel **ext_origin; // Top-left sample without padding
|
||||
int *ext_s; // Stride
|
||||
|
||||
} kvz_epol_args;
|
||||
|
||||
typedef void(epol_func)(kvz_epol_args *args);
|
||||
|
||||
|
||||
typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
|
||||
typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
|
||||
|
||||
// Declare function pointers.
|
||||
extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma;
|
||||
|
@ -55,8 +97,8 @@ extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
|
|||
extern epol_func * kvz_get_extended_block;
|
||||
extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
|
||||
extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
|
||||
extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
|
||||
extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
|
||||
extern kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
|
||||
extern kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;
|
||||
|
||||
|
||||
int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
|
||||
|
@ -69,8 +111,8 @@ int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
|
|||
{"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \
|
||||
{"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \
|
||||
{"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \
|
||||
{"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
|
||||
{"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \
|
||||
{"sample_quarterpel_luma_hi", (void**) &kvz_sample_quarterpel_luma_hi}, \
|
||||
{"sample_octpel_chroma_hi", (void**) &kvz_sample_octpel_chroma_hi}, \
|
||||
{"get_extended_block", (void**) &kvz_get_extended_block}, \
|
||||
|
||||
|
||||
|
|
|
@ -32,7 +32,6 @@
|
|||
#include "kvazaar.h"
|
||||
#include "tables.h"
|
||||
|
||||
|
||||
// Declare function pointers.
|
||||
typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
|
||||
int32_t height, int8_t type, int8_t scan_idx, int8_t block_type);
|
||||
|
@ -45,7 +44,7 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state,
|
|||
bool early_skip);
|
||||
typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
|
||||
int32_t height, int8_t type, int8_t block_type);
|
||||
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp);
|
||||
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
|
||||
|
||||
typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);
|
||||
|
||||
|
|
|
@ -31,7 +31,6 @@
|
|||
#include "encoderstate.h"
|
||||
#include "global.h" // IWYU pragma: keep
|
||||
|
||||
|
||||
extern const uint8_t kvz_g_chroma_scale[58];
|
||||
extern const int16_t kvz_g_inv_quant_scales[6];
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
race:kvz_eight_tap_filter_hor_8x1_avx2
|
||||
# AVX2 interpolation reads some extra pixels
|
||||
race:kvz_ipol_8tap_hor_px_im_avx2
|
||||
race:kvz_filter_hpel_blocks_hor_ver_luma_avx2
|
||||
race:kvz_eight_tap_filter_hor_avx2
|
Loading…
Reference in a new issue