Merge remote-tracking branch 'remotes/kvz_github/master' into Fix-monochrome

# Conflicts: # .gitlab-ci.yml # build/kvazaar_lib/kvazaar_lib.vcxproj.filters # src/cfg.c # src/encoder.h # src/kvazaar.h # src/rdo.c
2024-11-23 18:14:06 +00:00 · 2021-04-23 10:56:50 +03:00 · 2021-04-23 10:56:50 +03:00 · 1aaa95601c
parent 764d23cdf5 c36d423a8c
commit 1aaa95601c
46 changed files with 2174 additions and 1235 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -33,7 +33,7 @@ test-asan:
 #  variables:
 #    CFLAGS: '-fsanitize=thread'
 #    # Temporarily suppress known errors or false positives.
-#    TSAN_OPTIONS: 'suppressions=/builds/TIE/ultravideo/kvazaar/tests/tsan_suppressions.txt'
+#    TSAN_OPTIONS: 'suppressions=/builds/cs/ultravideo/kvazaar/tests/tsan_suppressions.txt'

 test-ubsan:
  <<: *test-template
--- a/README.md
+++ b/README.md
@ -117,6 +117,7 @@ Options:
                               bits, lambda, distortion, and qp for each ctu.
                               These are meant for debugging and are not
                               written unless the prefix is defined.
+
 Video structure:
  -q, --qp <integer>         : Quantization parameter [22]
  -p, --period <integer>     : Period of intra pictures [64]
@ -148,11 +149,11 @@ Video structure:
                                   - N: Target N bits per second.
      --rc-algorithm <string>: Select used rc-algorithm. [lambda]
                                   - lambda: rate control from:
-                                     DOI: 10.1109/TIP.2014.2336550
+                                     DOI: 10.1109/TIP.2014.2336550 
                                   - oba: DOI: 10.1109/TCSVT.2016.2589878
      --(no-)intra-bits      : Use Hadamard cost based allocation for intra
                               frames. Default on for gop 8 and off for lp-gop
-      --(no-)clip-neighbour  : On oba based rate control whether to clip
+      --(no-)clip-neighbour  : On oba based rate control whether to clip 
                               lambda values to same frame's ctus or previous'.
                               Default on for RA GOPS and disabled for LP.
      --(no-)lossless        : Use lossless coding. [disabled]
@ -253,6 +254,16 @@ Compression tools:
                                   - sensitive: Terminate even earlier.
      --fast-residual-cost <int> : Skip CABAC cost for residual coefficients
                                   when QP is below the limit. [0]
+      --fast-coeff-table <string> : Read custom weights for residual
+                                    coefficients from a file instead of using
+                                    defaults [default]
+      --fastrd-sampling : Enable learning data sampling for fast coefficient
+                          table generation
+      --fastrd-accuracy-check : Evaluate the accuracy of fast coefficient
+                                prediction
+      --fastrd-outdir : Directory to which to output sampled data or accuracy
+                        data, into <fastrd-outdir>/0.txt to 50.txt, one file
+                        for each QP that blocks were estimated on
      --(no-)intra-rdo-et    : Check intra modes in rdo stage only until
                               a zero coefficient CU is found. [disabled]
      --(no-)early-skip      : Try to find skip cu from merge candidates.
--- a/appveyor.yml
+++ b/appveyor.yml
@ -1,8 +1,3 @@
-# Only the whitelisted branches get built, regardless of build config
-branches:
-  only:
-    - master
-
 # Email the author if their commit either failed to build or fixed a failed build
 # good -> bad, bad -> bad, bad -> good  but not  good -> good
 notifications:
@ -37,13 +32,16 @@ configuration:
  - Release

 # Build with multiple compilers / build suites
-image: Visual Studio 2015
 environment:
  matrix:
-    - platform: Win32
-    - platform: x64
-    - MSYSTEM: MINGW32
-    - MSYSTEM: MINGW64
+    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
+      platform: Win32
+    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
+      platform: x64
+    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
+      MSYSTEM: MINGW32
+    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
+      MSYSTEM: MINGW64

 for:
 -
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj
@ -164,6 +164,7 @@
    <ClCompile Include="..\..\src\nal.c" />
    <ClCompile Include="..\..\src\rate_control.c" />
    <ClCompile Include="..\..\src\rdo.c" />
+    <ClCompile Include="..\..\src\fast_coeff_cost.c" />
    <ClCompile Include="..\..\src\sao.c" />
    <ClCompile Include="..\..\src\scalinglist.c" />
    <ClCompile Include="..\..\src\search.c" />
@ -290,6 +291,7 @@
    <ClInclude Include="..\..\src\nal.h" />
    <ClInclude Include="..\..\src\rate_control.h" />
    <ClInclude Include="..\..\src\rdo.h" />
+    <ClInclude Include="..\..\src\fast_coeff_cost.h" />
    <ClInclude Include="..\..\src\sao.h" />
    <ClInclude Include="..\..\src\scalinglist.h" />
    <ClInclude Include="..\..\src\search.h" />
@ -337,4 +339,4 @@
  <ImportGroup Label="ExtensionTargets">
    <Import Project="..\yasm\vsyasm.targets" />
  </ImportGroup>
-</Project>
+</Project>
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
@ -174,6 +174,12 @@
    <ClCompile Include="..\..\src\rdo.c">
      <Filter>Compression</Filter>
    </ClCompile>
+    <ClCompile Include="..\..\src\fast_coeff_cost.c">
+      <Filter>Compression</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\inter.c">
+      <Filter>Reconstruction</Filter>
+    </ClCompile>
    <ClCompile Include="..\..\src\intra.c">
      <Filter>Reconstruction</Filter>
    </ClCompile>
@ -342,6 +348,9 @@
    <ClInclude Include="..\..\src\rdo.h">
      <Filter>Compression</Filter>
    </ClInclude>
+    <ClInclude Include="..\..\src\fast_coeff_cost.h">
+      <Filter>Compression</Filter>
+    </ClInclude>
    <ClInclude Include="..\..\src\strategies\strategies-common.h">
      <Filter>Optimization\strategies</Filter>
    </ClInclude>
--- a/configure.ac
+++ b/configure.ac
@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=6
-ver_minor=3
+ver_minor=5
 ver_release=0

 # Prevents configure from adding a lot of defines to the CFLAGS
--- a/doc/kvazaar.1
+++ b/doc/kvazaar.1
@ -1,4 +1,4 @@
-.TH KVAZAAR "1" "September 2020" "kvazaar v2.0.0" "User Commands"
+.TH KVAZAAR "1" "January 2021" "kvazaar v2.0.0" "User Commands"
 .SH NAME
 kvazaar \- open source HEVC encoder
 .SH SYNOPSIS
@ -106,6 +106,7 @@ A prefix used for stats files that include
 bits, lambda, distortion, and qp for each ctu.
 These are meant for debugging and are not
 written unless the prefix is defined.
+
 .SS "Video structure:"
 .TP
 \fB\-q\fR, \fB\-\-qp <integer>        
@ -326,6 +327,24 @@ Motion estimation termination [on]
 Skip CABAC cost for residual coefficients
    when QP is below the limit. [0]
 .TP
+\fB\-\-fast\-coeff\-table <string>
+Read custom weights for residual
+     coefficients from a file instead of using
+     defaults [default]
+.TP
+\fB\-\-fastrd\-sampling
+Enable learning data sampling for fast coefficient
+                           table generation
+.TP
+\fB\-\-fastrd\-accuracy\-check
+Evaluate the accuracy of fast coefficient
+ prediction
+.TP
+\fB\-\-fastrd\-outdir
+Directory to which to output sampled data or accuracy
+                        data, into <fastrd\-outdir>/0.txt to 50.txt, one file
+                        for each QP that blocks were estimated on
+.TP
 \fB\-\-(no\-)intra\-rdo\-et   
 Check intra modes in rdo stage only until
 a zero coefficient CU is found. [disabled]
--- a/examples/README.md
+++ b/examples/README.md
@ -0,0 +1,14 @@
+Examples
+========
+Examples of external files for use with Kvazaar.
+
+## Region of interest (roi) files
+A simple text file can be used with the `--roi` switch to setup regions of interest for encoding.
+The header row of the file tells into how many regions the encoded frames are divided (columns, rows).
+The header must be followed by a data row whose number of entries equals columns * rows.
+The data row tells the encoder which delta QP value is assigned to each region.
+The included example file splits frames into four regions, with the top regions having a delta QP of +5:
+```
+2 2
+5 5 0 0
+``` 
--- a/examples/fast_coeff_table.txt
+++ b/examples/fast_coeff_table.txt
@ -0,0 +1,51 @@
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.046152 4.874163 3.830968 6.617950
+0.040648 4.920004 3.922710 6.572261
+0.033854 4.982197 4.021474 6.518219
+0.027073 5.056451 4.082557 6.471514
+0.021064 5.125763 4.113825 6.436425
+0.016605 5.170554 4.119091 6.423091
+0.012953 5.196849 4.128659 6.422746
+0.010218 5.194947 4.166336 6.431305
+0.007970 5.177114 4.217242 6.429468
+0.006442 5.138598 4.275070 6.396064
+0.005184 5.093265 4.337876 6.352651
+0.004134 5.046189 4.413434 6.310742
+0.003239 5.001028 4.492965 6.264692
+0.002689 4.959881 4.569652 6.198468
+0.002280 4.920991 4.642861 6.123074
+0.001940 4.886799 4.709124 6.049688
+0.001631 4.858057 4.767754 5.986929
+0.001409 4.839546 4.813134 5.951025
+0.001223 4.823649 4.856675 5.933274
+0.001055 4.806288 4.904500 5.940060
+0.000899 4.789201 4.950018 5.955955
+0.000781 4.776673 4.981798 5.982144
+0.000683 4.766721 5.006732 6.019175
+0.000603 4.757364 5.030649 6.081959
+0.000529 4.746016 5.059187 6.158720
+0.000460 4.729670 5.100437 6.254217
+0.000397 4.711187 5.150631 6.364452
+0.000345 4.692304 5.213098 6.506122
+0.000300 4.674471 5.279962 6.667672
+0.000264 4.660182 5.342776 6.836979
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
+0.000237 4.649543 5.392507 6.977093
--- a/examples/roi.txt
+++ b/examples/roi.txt
@ -0,0 +1,2 @@
+2 2
+5 5 0 0
--- a/rdcost-weight-tool/README.txt
+++ b/rdcost-weight-tool/README.txt
@ -0,0 +1,35 @@
+To extract the block costs, build Kvazaar as usual, and edit the relevant
+parameters at the beginning of extract_rdcosts.py and run_filter.py, most
+importantly the number of cores and the set of video sequences you want to
+encode to extract costs. Then run extract_rdcosts.py; it will use Kvazaar to
+encode each sequence and extract the costs measured there for the quantized blocks.
+The costs are stored compressed and sorted by block QP, in the following
+format:
+
+Size (B)  | Description
+----------+------------
+4         | size:   Coeff group size, in int16's
+4         | ccc:    Coeff group's coding cost
+size * 2  | coeffs: Coeff group data
+
+To analyze the costs by running a linear regression over them, build the two
+tools using:
+
+$ gcc filter_rdcosts.c -O2 -o frcosts_matrix
+$ gcc ols_2ndpart.c -O2 -o ols_2ndpart
+
+Then run the regression in parallel by running run_filter.py. The reason for
+doing it this way is that the data is stored compressed, so there is no way to
+mmap it in Matlab/Octave/something; the data sets are absolutely huge (larger
+than reasonable amounts of RAM in a decent workstation), but this way we can
+store the data compressed and process it in O(1) memory complexity, so it can
+be done as widely parallelized as you have CPU cores. The result files each
+consist of 4 numbers, which represent an approximate linear solution to the
+corresponding set of costs: the price in bits of a coefficient whose absolute
+value is a) 0, b) 1, c) 2, d) 3 or higher.
+
+After that, run rdcost_do_avg.py. It will calculate a per-QP average of the
+costs over the set of sequences that were run (i.e. for each QP, take the
+results for that QP for each sequence, and calculate their average). This data
+is what you can use to fill in the default_fast_coeff_cost_wts table in
+src/fast_coeff_cost.h.
--- a/rdcost-weight-tool/build.sh
+++ b/rdcost-weight-tool/build.sh
@ -0,0 +1,4 @@
+#!/bin/sh
+
+gcc -O2 filter_rdcosts.c -o frcosts_matrix
+gcc -O2 ols_2ndpart.c -o ols_2ndpart
--- a/rdcost-weight-tool/extract_rdcosts.py
+++ b/rdcost-weight-tool/extract_rdcosts.py
@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+
+import glob
+import gzip
+import os
+import subprocess
+import threading
+import time
+
+# Where logs and sampled data will wind up, and where the sequences are read.
+# Do note that the sequences variable is supposed to be a tuple, because you
+# could have multiple sets of sequences.
+logdir    =  "/tmp/rdcost/logs"
+ofdir     =  "/tmp/rdcost/data"
+sequences = ("/opt/test_seqs/custom_seqs/*/*.yuv",)
+
+# Note that n_kvazaars * len(dest_qps) has to be less than the max number of
+# fd's that a process can have (check it out: ulimit -a, likely 1024)
+smt_threads   = 8 # Kinda lazy, but just match this to your cpu
+n_kvz_threads = 1 # How many threads each kvz instance is running?
+n_kvazaars    = smt_threads // n_kvz_threads
+
+# You likely will not need to change anything below this line
+# Build a path inside the "src" directory that lives next to this script's own
+# directory, i.e. <parent of this script's directory>/src/<path>.
+kvz_srcdir    = lambda path: os.path.join(
+                                 os.path.dirname(
+                                     os.path.dirname(
+                                         os.path.realpath(__file__)
+                                     )
+                                 ), "src", path)
+
+
+# QPs (0..50) for which run_job() creates one pipe / .gz output pair per job.
+dest_qps      = tuple(range(51))
+# QPs (12..42) handed to the encoder via --qp; main() pairs each with every sequence.
+base_qps      = tuple(range(12, 43))
+
+# Encoder command line shared by all jobs; run_job() appends per-job arguments.
+kvzargs       = [kvz_srcdir("kvazaar"), "--threads", str(n_kvz_threads), "--preset=ultrafast", "--fastrd-sampling", "--fast-residual-cost=0"]
+# Passed as env= to subprocess.Popen, which uses it as the child's complete
+# environment rather than merging it with the parent's.
+kvzenv        = {"LD_LIBRARY_PATH": kvz_srcdir(".libs/")}
+
+class MultiPipeGZOutManager:
+    """Context manager that owns one named pipe per destination QP.
+
+    For every QP in dest_qps a FIFO path and a matching .gz output path are
+    derived under odpath. Entering the context creates the directory and
+    fresh FIFOs; leaving it removes the FIFOs again.
+    """
+    pipe_fn_template  = "%02i.txt"
+    gzout_fn_template = "%02i.txt.gz"
+
+    def __init__(self, odpath, dest_qps):
+        self.odpath = odpath
+        self.dest_qps = dest_qps
+
+        # Build both paths in a single pass so the lists stay index-aligned.
+        path_pairs = [
+            (os.path.join(self.odpath, self.pipe_fn_template % qp),
+             os.path.join(self.odpath, self.gzout_fn_template % qp))
+            for qp in dest_qps
+        ]
+        self.pipe_fns  = [fifo for fifo, _ in path_pairs]
+        self.gzout_fns = [gz for _, gz in path_pairs]
+
+    def __enter__(self):
+        os.makedirs(self.odpath, exist_ok=True)
+        for fifo_path in self.pipe_fns:
+            # Remove whatever an earlier run may have left at this path.
+            try:
+                os.unlink(fifo_path)
+            except FileNotFoundError:
+                pass
+            os.mkfifo(fifo_path)
+        return self
+
+    def __exit__(self, *_):
+        for fifo_path in self.pipe_fns:
+            os.unlink(fifo_path)
+
+    def items(self):
+        """Yield (pipe path, gz output path) tuples, one per destination QP."""
+        yield from zip(self.pipe_fns, self.gzout_fns)
+
+class MTSafeIterable:
+    """Iterator wrapper whose next() calls are serialised by a lock.
+
+    The wrapped object is advanced with next() directly, so it must already
+    be an iterator.
+    """
+    def __init__(self, iterable):
+        self.iterable = iterable
+        self.lock = threading.Lock()
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        self.lock.acquire()
+        try:
+            return next(self.iterable)
+        finally:
+            self.lock.release()
+
+def combinations(xi, yi):
+    """Yield every (x, y) pair, walking all of yi once for each x in xi."""
+    yield from ((x, y) for x in xi for y in yi)
+
+def chain(lol):
+    """Flatten one nesting level: yield the items of each sub-iterable in order."""
+    for inner in lol:
+        yield from inner
+
+# Would've used Popen with gzip, but "gzip [fifo]" with an unconnected fifo
+# will detect the situation and not block, but just consider it an empty
+# file. Don't like it when tools outsmart their user..
+def do_gzip(in_fn, out_fn):
+    """Copy in_fn into the gzip file out_fn block by block until EOF.
+
+    A progress line is printed each time another BLOCK_SZ * PRINT_MULT bytes
+    have been read, and a summary line once the input is exhausted.
+    """
+    BLOCK_SZ = 65536
+    PRINT_MULT = 1024
+    report_step = BLOCK_SZ * PRINT_MULT
+
+    with open(in_fn, "rb") as inf, gzip.open(out_fn, "wb") as outf:
+        total = 0
+        next_report = report_step
+        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
+        for block in iter(lambda: inf.read(BLOCK_SZ), b""):
+            total += len(block)
+            if (total >= next_report):
+                print("    read     %8i MB from %s" % (total / (1024 * 1024), in_fn))
+                next_report += report_step
+            outf.write(block)
+
+        print("    finished %8i MB from %s" % (total / (1024 * 1024), in_fn))
+
+def run_job(job):
+    """Run the encoder for one (sequence path, QP) job.
+
+    Output names are derived from the sequence's base name and the QP. One
+    do_gzip thread is started per destination-QP pipe, then the encoder is
+    run with its stderr redirected to the job's log file and waited for.
+    The gzip threads are started here but not joined in this function.
+    """
+    ifpath, qp = job
+    jobname = "%s-qp%i" % (os.path.basename(ifpath), qp)
+
+    hevcpath = os.path.join("/tmp", "%s.hevc" % jobname)
+    logpath  = os.path.join(logdir, "%s.log" % jobname)
+    odpath   = os.path.join(ofdir, jobname)
+
+    my_kvzargs = list(kvzargs)
+    my_kvzargs += ["-i", ifpath]
+    my_kvzargs += ["--qp", str(qp)]
+    my_kvzargs += ["-o", hevcpath]
+    my_kvzargs += ["--fastrd-outdir", odpath]
+
+    with open(logpath, "w") as lf, MultiPipeGZOutManager(odpath, dest_qps) as pipes_and_outputs:
+        gzip_threads = []
+        for pipe_fn, out_fn in pipes_and_outputs.items():
+            worker = threading.Thread(target=do_gzip, args=(pipe_fn, out_fn))
+            worker.start()
+            gzip_threads.append(worker)
+
+        kvz = subprocess.Popen(my_kvzargs, env=kvzenv, stderr=lf)
+        kvz.wait()
+
+def threadfunc(joblist):
+    """Worker body: take jobs from joblist until it is exhausted, running each."""
+    pending = iter(joblist)
+    while True:
+        try:
+            job = next(pending)
+        except StopIteration:
+            return
+        run_job(job)
+
+def main():
+    """Create the output directories and run all (sequence, QP) jobs on worker threads."""
+    assert(isinstance(sequences, tuple))
+    os.makedirs(logdir, exist_ok=True)
+    os.makedirs(ofdir, exist_ok=True)
+
+    # Expand every glob pattern lazily and pair each match with every base QP.
+    sequence_paths = chain(glob.glob(pattern) for pattern in sequences)
+    joblist = MTSafeIterable(combinations(sequence_paths, base_qps))
+
+    threads = []
+    for _ in range(n_kvazaars):
+        threads.append(threading.Thread(target=threadfunc, args=(joblist,)))
+
+    for worker in threads:
+        worker.start()
+
+    for worker in threads:
+        worker.join()
+
+if (__name__ == "__main__"):
+    main()
--- a/rdcost-weight-tool/filter_rdcosts.c
+++ b/rdcost-weight-tool/filter_rdcosts.c
@ -0,0 +1,134 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define BUFSZ (64 * 64 * sizeof(uint16_t))
+#define NUM_COEFF_BUCKETS (4)
+#define NUM_OTHER_BUCKETS (0)
+#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
+#define MAX_COEFF_BUCKET  ((NUM_COEFF_BUCKETS) - 1)
+
+#define clz(x) __builtin_clz(x)
+#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
+
+/*
+ * Debug helper: print the block header (size and coding cost) and then all
+ * `size` coefficients of `buf` on one space-separated line to stdout.
+ */
+void print_coeffs(const int16_t *buf, uint32_t size, uint32_t ccc)
+{
+  const int16_t *const end = buf + size;
+
+  printf("Buf size %u, ccc %u\n", size, ccc);
+  for (const int16_t *p = buf; p != end; p++) {
+    printf("%i ", *p);
+  }
+  printf("\n");
+}
+
+/*
+ * Histogram the absolute values of the `size` coefficients in `buf`.
+ *
+ * buckets[k] is incremented for each coefficient with |value| == k, with all
+ * magnitudes above MAX_COEFF_BUCKET landing in the last bucket. *num_signs
+ * is incremented once per negative coefficient. *excess is reset to zero and
+ * then accumulates, in 16-bit arithmetic, how far each clamped magnitude
+ * exceeded MAX_COEFF_BUCKET.
+ */
+void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs, uint16_t *excess)
+{
+  *excess = 0;
+
+  for (const int16_t *p = buf; p != buf + size; p++) {
+    const int32_t value = *p;
+    uint16_t magnitude;
+
+    if (value < 0) {
+      *num_signs += 1;
+      magnitude = (uint16_t)(-value);
+    } else {
+      magnitude = (uint16_t)value;
+    }
+
+    if (magnitude > MAX_COEFF_BUCKET) {
+      *excess += magnitude - MAX_COEFF_BUCKET;
+      magnitude = MAX_COEFF_BUCKET;
+    }
+
+    buckets[magnitude]++;
+  }
+}
+
+/*
+ * Debug helper: print each coefficient bucket count and the number of
+ * negative coefficients to stdout.
+ *
+ * The 64-bit counters are cast to unsigned long long and printed with %llu,
+ * because %lu only matches uint64_t on platforms where long is 64 bits wide.
+ */
+void print_buckets(const uint64_t *buckets, uint64_t num_signs)
+{
+  for (uint32_t i = 0; i < NUM_COEFF_BUCKETS; i++) {
+    printf("%3u: %llu\n", i, (unsigned long long)buckets[i]);
+  }
+  printf("Signs: %llu\n", (unsigned long long)num_signs);
+}
+
+/*
+ * Add the outer product of the bucket vector with itself to `mat`, a
+ * row-major NUM_TOTAL_BUCKETS x NUM_TOTAL_BUCKETS matrix:
+ * mat[row][col] += buckets[col] * buckets[row].
+ */
+void update_matrix(const uint64_t *buckets, uint64_t *mat)
+{
+  const int n_cells = NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS;
+
+  for (int pos = 0; pos < n_cells; pos++) {
+    const int row = pos / NUM_TOTAL_BUCKETS;
+    const int col = pos % NUM_TOTAL_BUCKETS;
+    mat[pos] += buckets[col] * buckets[row];
+  }
+}
+
+/*
+ * Return non-zero iff `u` is a power of two.
+ *
+ * Zero is explicitly rejected: the bit trick alone accepts it, and callers
+ * feed accepted values to a count-leading-zeros based log2, which is
+ * undefined for zero.
+ */
+static inline int is_power_of_two(uint32_t u)
+{
+  return u != 0 && (u & (u - 1)) == 0;
+}
+
+/*
+ * Read coefficient-group records from `in` until EOF and write the
+ * accumulated bucket-count product matrix to `out`.
+ *
+ * Each record is two uint32 header words (block size in int16 units, then
+ * the coding cost, which this stage does not use) followed by `size` int16
+ * coefficients. For every record the coefficients are bucketed with
+ * count_coeffs() and the result is folded into the matrix with
+ * update_matrix(). The matrix is printed one row per line.
+ *
+ * Returns 0 on success and 1 on allocation failure, truncated input or an
+ * invalid block size; nothing is written to `out` on failure.
+ */
+int process_rdcosts(FILE *in, FILE *out)
+{
+  /* Largest block, in int16 coefficients, that fits into the read buffer. */
+  const size_t max_coeffs = BUFSZ / sizeof(int16_t);
+  int rv = 0;
+  uint64_t mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0};
+
+  void *buf = malloc(BUFSZ);
+  if (buf == NULL) {
+    fprintf(stderr, "Failed to allocate read buffer\n");
+    return 1;
+  }
+  const uint32_t *u32buf = (const uint32_t *)buf;
+  const int16_t  *i16buf = (const int16_t  *)buf;
+
+  while (!feof(in)) {
+    uint32_t size;
+    uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
+    uint64_t cg_num_signs = 0;
+    uint16_t excess = 0;
+    size_t   n_read;
+
+    n_read = fread(buf, sizeof(uint32_t), 2, in);
+
+    // Can't rely on feof() alone when reading from a pipe that might only get
+    // closed long after the last data has been poured in
+    if (n_read == 0) {
+      break;
+    }
+    if (feof(in) || n_read < 2) {
+      fprintf(stderr, "Unexpected EOF when reading header, managed still to read %zu u32's\n", n_read);
+      rv = 1;
+      goto out;
+    }
+
+    /* Only look at the header once it is known to have been read in full. */
+    size = u32buf[0];
+
+    /* Reject zero, non-power-of-two sizes and anything that would overrun buf. */
+    if (size == 0 || !is_power_of_two(size) || size > max_coeffs) {
+      fprintf(stderr, "Errorneous block size %u\n", size);
+      rv = 1;
+      goto out;
+    }
+
+    n_read = fread(buf, sizeof(int16_t), size, in);
+    if (n_read != size) {
+      fprintf(stderr, "Unexpected EOF when reading block, managed still to read %zu i16's\n", n_read);
+      rv = 1;
+      goto out;
+    }
+
+    count_coeffs(i16buf, size, cg_buckets, &cg_num_signs, &excess);
+    update_matrix(cg_buckets, mat);
+  }
+
+  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
+    for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
+      int curr_pos = y * NUM_TOTAL_BUCKETS + x;
+      /* Cast so the conversion matches uint64_t regardless of long's width. */
+      fprintf(out, "%llu ", (unsigned long long)mat[curr_pos]);
+    }
+    fprintf(out, "\n");
+  }
+  fflush(out);
+
+out:
+  free(buf);
+  return rv;
+}
+
+/* Entry point: filter stdin to stdout; the command line is not inspected. */
+int main(int ar, char **av)
+{
+  const int status = process_rdcosts(stdin, stdout);
+
+  return status;
+}
--- a/rdcost-weight-tool/invert_matrix.m
+++ b/rdcost-weight-tool/invert_matrix.m
@ -0,0 +1,3 @@
+A = dlmread("/dev/stdin");
+B = inv(A);
+dlmwrite("/dev/stdout", B, " ");
--- a/rdcost-weight-tool/ols_2ndpart.c
+++ b/rdcost-weight-tool/ols_2ndpart.c
@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#define BUFSZ (64 * 64 * sizeof(uint16_t))
+#define NUM_COEFF_BUCKETS (4)
+#define NUM_OTHER_BUCKETS (0)
+#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
+#ifdef ERR_SQUARED
+#define STEPSIZE (0.00000001f * 0.000001f)
+#else
+#define STEPSIZE (0.00000001f)
+#endif
+
+#define clz(x) __builtin_clz(x)
+#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
+#define coord(x,y,w) ((x)+((y)*(w)))
+
+/*
+ * Accumulate one block's contribution into `res`: for every row y of the
+ * row-major NUM_TOTAL_BUCKETS x NUM_TOTAL_BUCKETS matrix `mat`,
+ * res[y] += dot(mat[y], buckets) * ccc.
+ */
+void update_result(const uint64_t *buckets, uint64_t ccc, const double *mat, double *res)
+{
+  for (int row = 0; row < NUM_TOTAL_BUCKETS; row++) {
+    double dot = 0.0;
+
+    for (int col = 0; col < NUM_TOTAL_BUCKETS; col++) {
+      dot += mat[coord(col, row, NUM_TOTAL_BUCKETS)] * (double)buckets[col];
+    }
+    dot *= (double)ccc;
+    res[row] += dot;
+  }
+}
+
+/*
+ * Load a NUM_TOTAL_BUCKETS x NUM_TOTAL_BUCKETS matrix of whitespace-separated
+ * numbers from the text file `fn` into `mat` in row-major order.
+ *
+ * Values are parsed as float and widened to double, as before. If the file
+ * cannot be opened or holds too few parsable numbers, an error is printed
+ * and the program exits, instead of dereferencing a NULL stream or storing
+ * uninitialised values.
+ */
+void read_matrix(const char *fn, double *mat)
+{
+  const int n_cells = NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS;
+
+  FILE *f = fopen(fn, "r");
+  if (f == NULL) {
+    perror(fn);
+    exit(EXIT_FAILURE);
+  }
+
+  for (int i = 0; i < n_cells; i++) {
+    float curr;
+    if (fscanf(f, "%f", &curr) != 1) {
+      fprintf(stderr, "%s: expected %d matrix entries, could not read entry %d\n", fn, n_cells, i);
+      fclose(f);
+      exit(EXIT_FAILURE);
+    }
+    mat[i] = curr;
+  }
+  fclose(f);
+}
+
+/*
+ * Histogram the absolute values of the `size` coefficients in `buf`.
+ *
+ * buckets[k] is incremented for each coefficient with |value| == k, with all
+ * magnitudes of NUM_COEFF_BUCKETS - 1 or more sharing the last bucket.
+ * *num_signs is incremented once per negative coefficient.
+ */
+void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs)
+{
+  for (const int16_t *p = buf; p != buf + size; p++) {
+    const int32_t value = *p;
+    uint16_t magnitude;
+
+    if (value < 0) {
+      *num_signs += 1;
+      magnitude = (uint16_t)(-value);
+    } else {
+      magnitude = (uint16_t)value;
+    }
+
+    if (magnitude >= NUM_COEFF_BUCKETS) {
+      magnitude = NUM_COEFF_BUCKETS - 1;
+    }
+
+    buckets[magnitude]++;
+  }
+}
+
+/*
+ * Return non-zero iff `u` is a power of two.
+ *
+ * Zero is explicitly rejected: the bit trick alone accepts it, and callers
+ * feed accepted values to a count-leading-zeros based log2, which is
+ * undefined for zero.
+ */
+static inline int is_power_of_two(uint32_t u)
+{
+  return u != 0 && (u & (u - 1)) == 0;
+}
+
+/*
+ * Read coefficient-group records from `in` until EOF and write the
+ * accumulated per-bucket result vector to `out`, one value per line.
+ *
+ * Each record is two uint32 header words (block size in int16 units, then
+ * the block's coding cost) followed by `size` int16 coefficients. For every
+ * record the coefficients are bucketed with count_coeffs() and folded into
+ * the result with update_result() using `mat`.
+ *
+ * Returns 0 on success and 1 on allocation failure, truncated input or an
+ * invalid block size; nothing is written to `out` on failure.
+ */
+int process_rdcosts(FILE *in, FILE *out, const double *mat)
+{
+  /* Largest block, in int16 coefficients, that fits into the read buffer. */
+  const size_t max_coeffs = BUFSZ / sizeof(int16_t);
+  int rv = 0;
+  double res[NUM_TOTAL_BUCKETS] = {0.0};
+
+  void *buf = malloc(BUFSZ);
+  if (buf == NULL) {
+    fprintf(stderr, "Failed to allocate read buffer\n");
+    return 1;
+  }
+  const uint32_t *u32buf = (const uint32_t *)buf;
+  const int16_t  *i16buf = (const int16_t  *)buf;
+
+  while (!feof(in)) {
+    uint32_t size, ccc;
+    uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
+    uint64_t cg_num_signs = 0;
+    size_t   n_read;
+
+    n_read = fread(buf, sizeof(uint32_t), 2, in);
+
+    // Can't rely on feof() alone when reading from a pipe that might only get
+    // closed long after the last data has been poured in
+    if (n_read == 0) {
+      break;
+    }
+    if (feof(in) || n_read < 2) {
+      fprintf(stderr, "Unexpected EOF when reading header, managed still to read %zu u32's\n", n_read);
+      rv = 1;
+      goto out;
+    }
+
+    /* Only look at the header once it is known to have been read in full. */
+    size = u32buf[0];
+    ccc  = u32buf[1];
+
+    /* Reject zero, non-power-of-two sizes and anything that would overrun buf. */
+    if (size == 0 || !is_power_of_two(size) || size > max_coeffs) {
+      fprintf(stderr, "Errorneous block size %u\n", size);
+      rv = 1;
+      goto out;
+    }
+
+    n_read = fread(buf, sizeof(int16_t), size, in);
+    if (n_read != size) {
+      fprintf(stderr, "Unexpected EOF when reading block, managed still to read %zu i16's\n", n_read);
+      rv = 1;
+      goto out;
+    }
+
+    count_coeffs(i16buf, size, cg_buckets, &cg_num_signs);
+    update_result(cg_buckets, ccc, mat, res);
+  }
+
+  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++)
+    fprintf(out, "%g\n", (float)(res[y]));
+
+out:
+  free(buf);
+  return rv;
+}
+
+/*
+ * Entry point: expects exactly one argument, the path of the matrix file,
+ * then processes stdin to stdout with that matrix.
+ */
+int main(int ar, char **av)
+{
+  double mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0.0};
+
+  if (ar == 2) {
+    read_matrix(av[1], mat);
+    return process_rdcosts(stdin, stdout, mat);
+  }
+
+  fprintf(stderr, "gib matrix plz\n");
+  return 1;
+}
+
--- a/rdcost-weight-tool/rdcost_do_avg.py
+++ b/rdcost-weight-tool/rdcost_do_avg.py
@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+import glob
+import sys
+
+result_path_template = "/tmp/rdcost/coeff_buckets/*-qp%02i.result"
+
+def main():
+    """Print, for each QP 0..50, the average of the four values in its result files.
+
+    Files that do not hold exactly four lines, or whose four numbers are all
+    zero, are reported and left out of the average. A QP with no usable file
+    gets four zeros.
+    """
+    row_format = "QP %2i: " + ", ".join(["%.6f"] * 4)
+    rows = []
+
+    for qp in range(51):
+        sums = [0.0] * 4
+        n_used = 0
+
+        for fn in glob.glob(result_path_template % qp):
+            with open(fn) as f:
+                contents = f.readlines()
+
+            if (len(contents) != 4):
+                print("Faulty file contents at %s, skipping" % fn, file=sys.stderr)
+                continue
+
+            nums = tuple(float(line) for line in contents)
+            if (not any(n != 0.0 for n in nums)):
+                print("All-zero file %s, skipping" % fn)
+                continue
+
+            n_used += 1
+            sums = [acc + num for acc, num in zip(sums, nums)]
+
+        avgs = tuple(acc / n_used for acc in sums) if (n_used > 0) else (0, 0, 0, 0)
+        rows.append(row_format % ((qp,) + avgs))
+
+    print("\n".join(rows))
+
+if (__name__ == "__main__"):
+    main()
--- a/rdcost-weight-tool/run_filter.py
+++ b/rdcost-weight-tool/run_filter.py
@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+
+import glob
+import gzip
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import threading
+import time
+
+# You should change these to your liking
+# Number of worker threads started by main().
+n_threads   = 8
+# Directory scanned for "<sequence>.yuv-qpNN" data directories.
+datadirs    = "/tmp/rdcost/data/"
+# Directory that receives one "<job>.result" file per job.
+resultdir   = "/tmp/rdcost/coeff_buckets"
+
+# Command lines of the helper programs. gzargs is not referenced by the code
+# visible in this file; decompression is done with the gzip module instead.
+gzargs      = ["gzip", "-d"]
+filtargs    = ["./frcosts_matrix"]
+octargs     = ["octave-cli", "invert_matrix.m"]
+filt2args   = ["./ols_2ndpart"]
+
+class MultiPipeManager:
+    """Context manager that owns one named pipe per destination QP.
+
+    Entering the context creates odpath and fresh FIFOs in it; leaving it
+    removes the FIFOs again.
+    """
+    pipe_fn_template  = "%02i.txt"
+
+    def __init__(self, odpath, dest_qps):
+        self.odpath = odpath
+        self.dest_qps = dest_qps
+        self.pipe_fns = [os.path.join(self.odpath, self.pipe_fn_template % qp)
+                         for qp in dest_qps]
+
+    def __enter__(self):
+        os.makedirs(self.odpath, exist_ok=True)
+        for fifo_path in self.pipe_fns:
+            # Remove whatever an earlier run may have left at this path.
+            try:
+                os.unlink(fifo_path)
+            except FileNotFoundError:
+                pass
+            os.mkfifo(fifo_path)
+        return self
+
+    def __exit__(self, *_):
+        for fifo_path in self.pipe_fns:
+            os.unlink(fifo_path)
+
+    def items(self):
+        """Yield the pipe paths in destination-QP order."""
+        yield from self.pipe_fns
+
+class MTSafeIterable:
+    """Iterator wrapper whose next() calls are serialised by a lock.
+
+    The wrapped object is advanced with next() directly, so it must already
+    be an iterator.
+    """
+    def __init__(self, iterable):
+        self.iterable = iterable
+        self.lock = threading.Lock()
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        self.lock.acquire()
+        try:
+            return next(self.iterable)
+        finally:
+            self.lock.release()
+
+def read_in_blocks(f):
+    """Yield successive reads of up to 65536 units from f until a read comes back empty."""
+    BLOCK_SZ = 65536
+    block = f.read(BLOCK_SZ)
+    while (len(block) != 0):
+        yield block
+        block = f.read(BLOCK_SZ)
+
+def exhaust_gzs(sink_f, gzs):
+    """Decompress every gzip file in gzs, in order, into sink_f.
+
+    A progress line is printed per input file. The sink is flushed after
+    every block that is written to it.
+    """
+    for gz in gzs:
+        with gzip.open(gz, "rb") as f:
+            print("  Doing %s ..." % gz)
+            for block in read_in_blocks(f):
+                sink_f.write(block)
+                sink_f.flush()
+
+def run_job(jobname, input_gzs):
+    """Run the two-stage regression for one job and write its result file.
+
+    Stage one streams the decompressed inputs through the first filter, whose
+    output is piped into Octave; Octave's output goes to a temporary file.
+    Stage two streams the same inputs through the second filter, which gets
+    the temporary file's name as its argument and writes to
+    <resultdir>/<jobname>.result.
+    """
+    resultpath = os.path.join(resultdir, "%s.result" % jobname)
+    print("Running job %s" % jobname)
+
+    with tempfile.NamedTemporaryFile() as tf:
+        filt = subprocess.Popen(filtargs, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        octa = subprocess.Popen(octargs, stdin=filt.stdout, stdout=tf)
+
+        try:
+            exhaust_gzs(filt.stdin, input_gzs)
+        except OSError as e:
+            print("OSError %s" % e, file=sys.stderr)
+            raise
+
+        # Closing stdin signals EOF to the first stage so both processes can finish.
+        filt.stdin.close()
+        filt.wait()
+        octa.wait()
+
+        if (filt.returncode != 0):
+            print("First stage failed: %s" % jobname, file=sys.stderr)
+            assert(0)
+
+        with open(resultpath, "w") as rf:
+            f2a = filt2args + [tf.name]
+            f2 = subprocess.Popen(f2a, stdin=subprocess.PIPE, stdout=rf)
+            exhaust_gzs(f2.stdin, input_gzs)
+            # communicate() closes stdin and waits, which sets f2.returncode.
+            f2.communicate()
+            # Check the second-stage process itself; the first stage's return
+            # code was already verified above.
+            if (f2.returncode != 0):
+                print("Second stage failed: %s" % jobname, file=sys.stderr)
+                assert(0)
+
+    print("Job %s done" % jobname)
+
+def threadfunc(joblist):
+    """Worker body: take (name, inputs) jobs from joblist until it is exhausted."""
+    pending = iter(joblist)
+    while True:
+        try:
+            jobname, job = next(pending)
+        except StopIteration:
+            return
+        run_job(jobname, job)
+
+def scan_datadirs(path):
+    """Yield (job name, list of input .gz paths) for every sequence and QP.
+
+    Sequence names are taken from sub-directories of path named
+    "<name>.yuv-qp<N>". For each sequence and each QP 0..50, the job collects
+    the "<QP>.txt.gz" files from all of that sequence's "-qp*" directories;
+    the list may be empty.
+    """
+    # Raw string so that "\." reaches the regex engine as written.
+    dir_pattern = re.compile(r"^([A-Za-z0-9_]+\.yuv)-qp[0-9]{1,2}$")
+
+    seq_names = set()
+    for dirent in os.scandir(path):
+        if (not dirent.is_dir()):
+            continue
+        match = dir_pattern.search(dirent.name)
+        if (match is not None):
+            seq_names.add(match.group(1))
+
+    for seq_name in seq_names:
+        seq_glob = os.path.join(path, seq_name + "-qp*/")
+
+        for qp in range(51):
+            job_name = seq_name + "-qp%02i" % qp
+            qp_fn = "%02i.txt.gz" % qp
+            yield job_name, glob.glob(os.path.join(seq_glob, qp_fn))
+
+def main():
+    """Create the data and result directories and run all scanned jobs on worker threads."""
+    os.makedirs(datadirs, exist_ok=True)
+    os.makedirs(resultdir, exist_ok=True)
+
+    joblist = MTSafeIterable(iter(scan_datadirs(datadirs)))
+
+    threads = []
+    for _ in range(n_threads):
+        threads.append(threading.Thread(target=threadfunc, args=(joblist,)))
+
+    for worker in threads:
+        worker.start()
+
+    for worker in threads:
+        worker.join()
+
+if (__name__ == "__main__"):
+    main()
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -73,6 +73,8 @@ libkvazaar_la_SOURCES = \
 	encoder_state-geometry.h \
 	encode_coding_tree.c \
 	encode_coding_tree.h \
+	fast_coeff_cost.c \
+	fast_coeff_cost.h \
 	filter.c \
 	filter.h \
 	global.h \
--- a/src/cfg.c
+++ b/src/cfg.c
@ -81,6 +81,7 @@ int kvz_config_init(kvz_config *cfg)
  cfg->vui.chroma_loc  = 0; /* left center */
  cfg->aud_enable      = 0;
  cfg->cqmfile         = NULL;
+  cfg->fast_coeff_table_fn = NULL;
  cfg->ref_frames      = 1;
  cfg->gop_len         = 4;
  cfg->gop_lowdelay    = true;
@ -176,6 +177,10 @@ int kvz_config_init(kvz_config *cfg)

  cfg->stats_file_prefix = NULL;

+  cfg->fastrd_sampling_on = 0;
+  cfg->fastrd_accuracy_check_on = 0;
+  cfg->fastrd_learning_outdir_fn = NULL;
+
  int8_t in[] = { 17, 27, 32, 44 };
  int8_t out[] = { 17, 29, 34, 41 };

@ -196,11 +201,13 @@ int kvz_config_destroy(kvz_config *cfg)
 {
  if (cfg) {
    FREE_POINTER(cfg->cqmfile);
+    FREE_POINTER(cfg->fast_coeff_table_fn);
    FREE_POINTER(cfg->tiles_width_split);
    FREE_POINTER(cfg->tiles_height_split);
    FREE_POINTER(cfg->slice_addresses_in_ts);
    FREE_POINTER(cfg->roi.dqps);
    FREE_POINTER(cfg->optional_key);
+    FREE_POINTER(cfg->fastrd_learning_outdir_fn);
    if (cfg->param_set_map)
    {
      FREE_POINTER(cfg->param_set_map);
@ -904,6 +911,30 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
    cfg->cqmfile = cqmfile;
    cfg->scaling_list = KVZ_SCALING_LIST_CUSTOM;
  }
+  else if OPT("fast-coeff-table") {
+    char* fast_coeff_table_fn = strdup(value);
+    if (!fast_coeff_table_fn) {
+      fprintf(stderr, "Failed to allocate memory for fast coeff table file name.\n");
+      return 0;
+    }
+    FREE_POINTER(cfg->fast_coeff_table_fn);
+    cfg->fast_coeff_table_fn = fast_coeff_table_fn;
+  }
+  else if OPT("fastrd-sampling") {
+    cfg->fastrd_sampling_on = 1;
+  }
+  else if OPT("fastrd-accuracy-check") {
+    cfg->fastrd_accuracy_check_on = 1;
+  }
+  else if OPT("fastrd-outdir") {
+    char *fastrd_learning_outdir_fn = strdup(value);
+    if (!fastrd_learning_outdir_fn) {
+      fprintf(stderr, "Failed to allocate memory for fast RD learning outfile name.\n");
+      return 0;
+    }
+    FREE_POINTER(cfg->fastrd_learning_outdir_fn);
+    cfg->fastrd_learning_outdir_fn = fastrd_learning_outdir_fn;
+  }
  else if OPT("scaling-list") {    
    int8_t scaling_list = KVZ_SCALING_LIST_OFF;
    int result = parse_enum(value, scaling_list_names, &scaling_list);
--- a/src/cli.c
+++ b/src/cli.c
@ -155,6 +155,10 @@ static const struct option long_options[] = {
  { "no-clip-neighbour",        no_argument, NULL, 0 },
  { "input-file-format",  required_argument, NULL, 0 },
  { "stats-file-prefix",  required_argument, NULL, 0 },
+  { "fast-coeff-table",   required_argument, NULL, 0 },
+  { "fastrd-sampling",          no_argument, NULL, 0 },
+  { "fastrd-accuracy-check",    no_argument, NULL, 0 },
+  { "fastrd-outdir",      required_argument, NULL, 0 },
  {0, 0, 0, 0}
 };

@ -577,6 +581,16 @@ void print_help(void)
    "                                   - sensitive: Terminate even earlier.\n"
    "      --fast-residual-cost <int> : Skip CABAC cost for residual coefficients\n"
    "                                   when QP is below the limit. [0]\n"
+    "      --fast-coeff-table <string> : Read custom weights for residual\n"
+    "                                    coefficients from a file instead of using\n"
+    "                                    defaults [default]\n"
+    "      --fastrd-sampling : Enable learning data sampling for fast coefficient\n"
+    "                          table generation\n"
+    "      --fastrd-accuracy-check : Evaluate the accuracy of fast coefficient\n"
+    "                                prediction\n"
+    "      --fastrd-outdir : Directory to which to output sampled data or accuracy\n"
+    "                        data, into <fastrd-outdir>/0.txt to 50.txt, one file\n"
+    "                        for each QP that blocks were estimated on\n"
    "      --(no-)intra-rdo-et    : Check intra modes in rdo stage only until\n"
    "                               a zero coefficient CU is found. [disabled]\n"
    "      --(no-)early-skip      : Try to find skip cu from merge candidates.\n"
--- a/src/encmain.c
+++ b/src/encmain.c
@ -279,7 +279,11 @@ done:
  // Do some cleaning up.
  args->api->picture_free(frame_in);

+  // This thread exit call causes problems with media auto-build suite
+  // The environment compiles with MINGW using a different pthreads lib
+  #if !defined(__MINGW32__) && !defined(__MINGW64__)
  pthread_exit(NULL);
+  #endif
  return NULL;
 }

--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@ -25,6 +25,7 @@
 #include "cu.h"
 #include "encoder.h"
 #include "extras/crypto.h"
+#include "global.h"
 #include "imagelist.h"
 #include "inter.h"
 #include "intra.h"
@ -351,8 +352,9 @@ static void encode_transform_coeff(encoder_state_t * const state,
    if (state->must_code_qp_delta) {
      const int qp_pred      = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp);
      const int qp_delta     = cur_cu->qp - qp_pred;
-      assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding.");
-      assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25]."); // This range applies only to 8-bit encoding
+      // Possible deltaQP range depends on bit depth as stated in HEVC specification.
+      assert(qp_delta >= KVZ_QP_DELTA_MIN && qp_delta <= KVZ_QP_DELTA_MAX && "QP delta not in valid range.");
+
      const int qp_delta_abs = ABS(qp_delta);
      cabac_data_t* cabac    = &state->cabac;

--- a/src/encoder.c
+++ b/src/encoder.c
@ -28,9 +28,10 @@

 #include "cfg.h"
 #include "gop.h"
+#include "rdo.h"
 #include "strategyselector.h"
 #include "kvz_math.h"
-
+#include "fast_coeff_cost.h"

 /**
 * \brief Strength of QP adjustments when using adaptive QP for 360 video.
@ -275,6 +276,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
  encoder->cfg.tiles_width_split = NULL;
  encoder->cfg.tiles_height_split = NULL;
  encoder->cfg.slice_addresses_in_ts = NULL;
+  encoder->cfg.fast_coeff_table_fn = NULL;

  if (encoder->cfg.gop_len > 0) {
    if (encoder->cfg.gop_lowdelay) {
@ -287,7 +289,8 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
  } 
  
  if( encoder->cfg.intra_qp_offset_auto ) {
-      encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? -kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1 : 0;
+    // Limit offset to -3 since HM/VTM seems to use it even for 32 frame gop
+    encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? MAX(-(int8_t)kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1, -3) : 0;
  }

  // Disable GOP and QP offset for all-intra coding
@ -381,6 +384,31 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
    encoder->scaling_list.use_default_list = 1;
  }

+  if (cfg->fast_coeff_table_fn) {
+    FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
+    if (fast_coeff_table_f == NULL) {
+      fprintf(stderr, "Could not open fast coeff table file.\n");
+      goto init_failed;
+    }
+    if (kvz_fast_coeff_table_parse(&encoder->fast_coeff_table, fast_coeff_table_f) != 0) {
+      fprintf(stderr, "Failed to parse fast coeff table, using default\n");
+      kvz_fast_coeff_use_default_table(&encoder->fast_coeff_table);
+    }
+    fclose(fast_coeff_table_f);
+  } else {
+    kvz_fast_coeff_use_default_table(&encoder->fast_coeff_table);
+  }
+
+  if (cfg->fastrd_sampling_on || cfg->fastrd_accuracy_check_on) {
+    if (cfg->fastrd_learning_outdir_fn == NULL) {
+      fprintf(stderr, "No output file defined for Fast RD sampling or accuracy check.\n");
+      goto init_failed;
+    }
+    if (kvz_init_rdcost_outfiles(cfg->fastrd_learning_outdir_fn) != 0) {
+      goto init_failed;
+    }
+  }
+
  kvz_scalinglist_process(&encoder->scaling_list, encoder->bitdepth);

  kvz_encoder_control_input_init(encoder, encoder->cfg.width, encoder->cfg.height);
@ -742,6 +770,8 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)
    if (encoder->qp_map[i]) FREE_POINTER(encoder->qp_map[i]);
  }

+  kvz_close_rdcost_outfiles();
+
  free(encoder);
 }

--- a/src/encoder.h
+++ b/src/encoder.h
@ -30,7 +30,7 @@
 #include "kvazaar.h"
 #include "scalinglist.h"
 #include "threadqueue.h"
-
+#include "fast_coeff_cost.h"

 /* Encoder control options, the main struct */
 typedef struct encoder_control_t
@ -135,6 +135,8 @@ typedef struct encoder_control_t

  int32_t poc_lsb_bits;

+  fast_coeff_table_t fast_coeff_table;
+
  int8_t* qp_map[3];

 } encoder_control_t;
--- a/src/estimate.m
+++ b/src/estimate.m
@ -0,0 +1,5 @@
+% Least-squares fit: reads space-separated rows from stdin, regresses
+% column 6 on columns 1-5 and prints the fitted weights.
+samples = dlmread("/dev/stdin", " ");
+regressors = samples(:, 1:5);
+observed = samples(:, 6);
+[beta, sigma, r] = ols(observed, regressors);
+disp(beta)
--- a/src/fast_coeff_cost.c
+++ b/src/fast_coeff_cost.c
@ -0,0 +1,56 @@
+#include "fast_coeff_cost.h"
+#include "kvazaar.h"
+#include "encoderstate.h"
+
+// Convert a cost to unsigned Q8.8 fixed point (value * 256, rounded to
+// nearest). Costs are expected to be non-negative; anything that does not
+// fit is saturated instead of being converted.
+static uint16_t to_q88(float f)
+{
+  const float scaled = f * 256.0f + 0.5f;
+
+  // Converting an out-of-range float to an integer type is undefined
+  // behaviour, so saturate first. The negated test also maps NaN to zero.
+  if (!(scaled > 0.0f)) {
+    return 0;
+  }
+  if (scaled >= 65535.0f) {
+    return UINT16_MAX;
+  }
+  return (uint16_t)scaled;
+}
+
+// Pack four weights into one 64-bit word, each converted with to_q88().
+// f[0] lands in the lowest 16 bits and f[3] in the highest.
+static uint64_t to_4xq88(const float f[4])
+{
+  uint64_t packed = 0;
+
+  for (int lane = 0; lane < 4; lane++) {
+    packed |= (uint64_t)to_q88(f[lane]) << (16 * lane);
+  }
+  return packed;
+}
+
+// Read MAX_FAST_COEFF_COST_QP rows of four floats from fast_coeff_table_f and
+// store each row, packed by to_4xq88(), into fast_coeff_table.
+// Returns 0 on success and 1 as soon as a row does not yield four values;
+// entries written before the failing row are left in place.
+int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_coeff_table_f)
+{
+  for (int qp = 0; qp < MAX_FAST_COEFF_COST_QP; qp++) {
+    float wts[4];
+    const int n_read = fscanf(fast_coeff_table_f, "%f %f %f %f\n",
+                              &wts[0], &wts[1], &wts[2], &wts[3]);
+
+    if (n_read != 4) {
+      return 1;
+    }
+    fast_coeff_table->wts_by_qp[qp] = to_4xq88(wts);
+  }
+  return 0;
+}
+
+// Fill fast_coeff_table from the first MAX_FAST_COEFF_COST_QP rows of
+// default_fast_coeff_cost_wts, packing each row with to_4xq88().
+void kvz_fast_coeff_use_default_table(fast_coeff_table_t *fast_coeff_table)
+{
+  for (int qp = 0; qp < MAX_FAST_COEFF_COST_QP; qp++) {
+    const float *row = default_fast_coeff_cost_wts[qp];
+
+    fast_coeff_table->wts_by_qp[qp] = to_4xq88(row);
+  }
+}
+
+// Return the packed weights for the QP currently set in the encoder state.
+// The table holds only MAX_FAST_COEFF_COST_QP entries, so the index is
+// clamped to keep an out-of-range QP from reading outside the array.
+// NOTE(review): callers may already restrict the QP range; confirm.
+uint64_t kvz_fast_coeff_get_weights(const encoder_state_t *state)
+{
+  const fast_coeff_table_t *table = &(state->encoder_control->fast_coeff_table);
+  int qp = state->qp;
+
+  if (qp < 0) {
+    qp = 0;
+  } else if (qp >= MAX_FAST_COEFF_COST_QP) {
+    qp = MAX_FAST_COEFF_COST_QP - 1;
+  }
+  return table->wts_by_qp[qp];
+}
--- a/src/fast_coeff_cost.h
+++ b/src/fast_coeff_cost.h
@ -0,0 +1,78 @@
+#ifndef FAST_COEFF_COST_H_
+#define FAST_COEFF_COST_H_
+
+#include <stdio.h>
+#include "kvazaar.h"
+// #include "encoderstate.h"
+
+// Number of per-QP entries held by a fast coefficient cost table.
+#define MAX_FAST_COEFF_COST_QP 50
+
+// Per-QP weights, indexed by QP. Each entry is one 64-bit word holding the
+// four bucket weights as packed by to_4xq88() in fast_coeff_cost.c.
+typedef struct {
+  uint64_t wts_by_qp[MAX_FAST_COEFF_COST_QP];
+} fast_coeff_table_t;
+
+// Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), one row per
+// QP starting from QP 0. Only the first MAX_FAST_COEFF_COST_QP rows are read
+// when the default table is loaded.
+static const float default_fast_coeff_cost_wts[][4] = {
+  // Just extend it by stretching the first actual values..
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  // up to here
+  {0.164240, 4.161530, 3.509033, 6.928047},
+  {0.162844, 4.055940, 3.564467, 6.861493},
+  {0.128729, 4.311973, 3.942837, 6.935403},
+  {0.110956, 4.433190, 3.945753, 6.877697},
+  {0.095026, 4.483547, 4.194173, 6.781540},
+  {0.075046, 4.633703, 4.084193, 6.698600},
+  {0.052426, 4.967223, 4.027210, 6.549197},
+  {0.040219, 5.141820, 3.982650, 6.461557},
+  {0.035090, 5.192493, 3.830950, 6.418477},
+  {0.029845, 5.211647, 3.815457, 6.345440},
+  {0.023522, 5.322213, 3.816537, 6.360677},
+  {0.021305, 5.225923, 3.842700, 6.325787},
+  {0.015878, 5.183090, 3.956003, 6.329680},
+  {0.010430, 5.099230, 4.176803, 6.305400},
+  {0.008433, 5.030257, 4.237587, 6.270133},
+  {0.006500, 4.969247, 4.339397, 6.217827},
+  {0.004929, 4.923500, 4.442413, 6.183523},
+  {0.003715, 4.915583, 4.429090, 6.125320},
+  {0.003089, 4.883907, 4.562790, 6.156447},
+  {0.002466, 4.881063, 4.629883, 6.142643},
+  {0.002169, 4.882493, 4.646313, 6.127663},
+  {0.002546, 4.793337, 4.837413, 6.199270},
+  {0.001314, 4.808853, 4.828337, 6.243437},
+  {0.001154, 4.862603, 4.846883, 6.205523},
+  {0.000984, 4.866403, 4.859330, 6.240893},
+  {0.000813, 4.856633, 4.924527, 6.293413},
+  {0.001112, 4.789260, 5.009880, 6.433540},
+  {0.000552, 4.760747, 5.090447, 6.599380},
+  {0.000391, 4.961447, 5.111033, 6.756370},
+  {0.000332, 4.980953, 5.138127, 6.867420},
+  {0.000201, 5.181957, 4.740160, 6.460997},
+  {0.000240, 5.185390, 4.874840, 6.819093},
+  {0.000130, 5.270350, 4.734213, 6.826240},
+  {0.000104, 5.371937, 4.595087, 6.659253},
+  {0.000083, 5.362000, 4.617470, 6.837770},
+  {0.000069, 5.285997, 4.754993, 7.159043},
+  {0.000049, 5.488470, 4.396107, 6.727357},
+  {0.000058, 4.958940, 4.580460, 6.477740},
+  {0.000028, 5.521253, 4.440493, 7.205017},
+  {0.000000, 0.000000, 0.000000, 0.000000},
+  {0.000019, 5.811260, 4.399110, 7.336310},
+};
+
+typedef struct encoder_state_t encoder_state_t;
+
+int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_coeff_table_f);
+void kvz_fast_coeff_use_default_table(fast_coeff_table_t *fast_coeff_table);
+uint64_t kvz_fast_coeff_get_weights(const encoder_state_t *state);
+
+#endif // FAST_COEFF_COST_H_
--- a/src/global.h
+++ b/src/global.h
@ -379,4 +379,8 @@ typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V } color_t;
 #  define COMPILE_ARM 0
 #endif

+// Min & max delta QP limits based on bit depth. Each expansion is fully
+// parenthesized so the macros stay correct inside larger expressions.
+#define KVZ_QP_DELTA_MIN (-(26 + 3 * (KVZ_BIT_DEPTH - 8)))
+#define KVZ_QP_DELTA_MAX (25 + 3 * (KVZ_BIT_DEPTH - 8))
+
 #endif
--- a/src/image.c
+++ b/src/image.c
@ -483,33 +483,46 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic,
                             ref->stride) >> (KVZ_BIT_DEPTH - 8);
  } else {
    // Extrapolate pixels from outside the frame.
-    kvz_extended_block block;
-    kvz_get_extended_block(pic_x,
-                           pic_y,
-                           ref_x - pic_x,
-                           ref_y - pic_y,
-                           0,
-                           0,
-                           ref->y,
-                           ref->width,
-                           ref->height,
-                           0,
-                           block_width,
-                           block_height,
-                           &block);
+
+    // Space for extrapolated pixels and the part from the picture
+    // The extrapolation function will set the pointers and stride.
+    kvz_pixel ext_buffer[LCU_LUMA_SIZE];
+    kvz_pixel *ext = NULL;
+    kvz_pixel *ext_origin = NULL;
+    int ext_s = 0;
+    kvz_epol_args epol_args = {
+      .src = ref->y,
+      .src_w = ref->width,
+      .src_h = ref->height,
+      .src_s = ref->stride,
+      .blk_x = ref_x,
+      .blk_y = ref_y,
+      .blk_w = block_width,
+      .blk_h = block_height,
+      .pad_l = 0,
+      .pad_r = 0,
+      .pad_t = 0,
+      .pad_b = 0,
+      .pad_b_simd = 0,
+    };
+
+    // Initialize separately. Gets rid of warning
+    // about using nonstandard extension.
+    epol_args.buf = ext_buffer;
+    epol_args.ext = &ext;
+    epol_args.ext_origin = &ext_origin;
+    epol_args.ext_s = &ext_s;
+
+    kvz_get_extended_block(&epol_args);

    const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];

    unsigned satd = kvz_satd_any_size(block_width,
-                                      block_height,
-                                      pic_data,
-                                      pic->stride,
-                                      block.buffer,
-                                      block.stride) >> (KVZ_BIT_DEPTH - 8);
-
-    if (block.malloc_used) {
-      FREE_POINTER(block.buffer);
-    }
+      block_height,
+      pic_data,
+      pic->stride,
+      ext_origin,
+      ext_s) >> (KVZ_BIT_DEPTH - 8);

    return satd;
  }
--- a/src/inter.c
+++ b/src/inter.c
@ -40,224 +40,258 @@ typedef struct {
 } merge_candidates_t;


-static void inter_recon_frac_luma(const encoder_state_t * const state,
-                                  const kvz_picture * const ref,
-                                  int32_t xpos,
-                                  int32_t ypos,
-                                  int32_t block_width,
-                                  int32_t block_height,
-                                  const int16_t mv_param[2],
-                                  lcu_t *lcu)
+static void inter_recon_frac_luma(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  lcu_t *lcu)
 {
  int mv_frac_x = (mv_param[0] & 3);
  int mv_frac_y = (mv_param[1] & 3);

-  // Fractional luma 1/4-pel
-  kvz_extended_block src = {0, 0, 0, 0};
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;
+  kvz_epol_args epol_args = {
+    .src = ref->y,
+    .src_w = ref->width,
+    .src_h = ref->height,
+    .src_s = ref->stride,
+    .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
+    .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
+    .blk_w = block_width,
+    .blk_h = block_height,
+    .pad_l = KVZ_LUMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_t = KVZ_LUMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_b_simd = 1 // One row for AVX2
+  };

-  // Fractional luma
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         mv_param[0] >> 2,
-                         mv_param[1] >> 2,
-                         state->tile->offset_x,
-                         state->tile->offset_y,
-                         ref->y,
-                         ref->width,
-                         ref->height,
-                         KVZ_LUMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;
+
+  kvz_get_extended_block(&epol_args);
  kvz_sample_quarterpel_luma(state->encoder_control,
-                                     src.orig_topleft,
-                                     src.stride,
-                                     block_width,
-                                     block_height,
-                                     lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
-                                     LCU_WIDTH,
-                                     mv_frac_x,
-                                     mv_frac_y,
-                                     mv_param);
-
-  if (src.malloc_used) free(src.buffer);
+    ext_origin,
+    ext_s,
+    block_width,
+    block_height,
+    lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
+    LCU_WIDTH,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }

-static void inter_recon_14bit_frac_luma(const encoder_state_t * const state,
-                                        const kvz_picture * const ref,
-                                        int32_t xpos,
-                                        int32_t ypos,
-                                        int32_t block_width,
-                                        int32_t block_height,
-                                        const int16_t mv_param[2],
-                                        hi_prec_buf_t *hi_prec_out)
+static void inter_recon_frac_luma_hi(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  hi_prec_buf_t *hi_prec_out)
 {
  int mv_frac_x = (mv_param[0] & 3);
  int mv_frac_y = (mv_param[1] & 3);

-  // Fractional luma 1/4-pel
-  kvz_extended_block src = { 0, 0, 0, 0 };
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;
+  kvz_epol_args epol_args = {
+    .src = ref->y,
+    .src_w = ref->width,
+    .src_h = ref->height,
+    .src_s = ref->stride,
+    .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
+    .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
+    .blk_w = block_width,
+    .blk_h = block_height,
+    .pad_l = KVZ_LUMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_t = KVZ_LUMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_b_simd = 1 // One row for AVX2
+  };

-  // Fractional luma
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         mv_param[0] >> 2,
-                         mv_param[1] >> 2,
-                         state->tile->offset_x,
-                         state->tile->offset_y,
-                         ref->y,
-                         ref->width,
-                         ref->height,
-                         KVZ_LUMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src);
-  kvz_sample_14bit_quarterpel_luma(state->encoder_control,
-                                           src.orig_topleft,
-                                           src.stride,
-                                           block_width,
-                                           block_height,
-                                           hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
-                                           LCU_WIDTH,
-                                           mv_frac_x,
-                                           mv_frac_y,
-                                           mv_param);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;

-  if (src.malloc_used) free(src.buffer);
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_quarterpel_luma_hi(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width,
+    block_height,
+    hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
+    LCU_WIDTH,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }

-static void inter_recon_frac_chroma(const encoder_state_t * const state,
-                                    const kvz_picture * const ref,
-                                    int32_t xpos,
-                                    int32_t ypos,
-                                    int32_t block_width,
-                                    int32_t block_height,
-                                    const int16_t mv_param[2],
-                                    lcu_t *lcu)
+static void inter_recon_frac_chroma(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  lcu_t *lcu)
 {
  int mv_frac_x = (mv_param[0] & 7);
  int mv_frac_y = (mv_param[1] & 7);

-  // Translate to chroma
-  xpos >>= 1;
-  ypos >>= 1;
-  block_width >>= 1;
-  block_height >>= 1;
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;

-  // Fractional chroma 1/8-pel
-  kvz_extended_block src_u = { 0, 0, 0, 0 };
-  kvz_extended_block src_v = { 0, 0, 0, 0 };
+  // Chroma U
+  // Divisions by 2 due to 4:2:0 chroma subsampling
+  kvz_epol_args epol_args = {
+    .src = ref->u,
+    .src_w = ref->width / 2,
+    .src_h = ref->height / 2,
+    .src_s = ref->stride / 2,
+    .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
+    .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
+    .blk_w = block_width / 2,
+    .blk_h = block_height / 2,
+    .pad_l = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_t = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b_simd = 3 // Three rows for AVX2
+  };

-  //Fractional chroma U
-  kvz_get_extended_block(xpos, ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->u,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_u);
-  kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
-    block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;

-  //Fractional chroma V
-  kvz_get_extended_block(xpos, ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->v,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_v);
-  kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
-    block_height, lcu->rec.v + (ypos  % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);

-  if (src_u.malloc_used) free(src_u.buffer);
-  if (src_v.malloc_used) free(src_v.buffer);
+  // Chroma V
+  epol_args.src = ref->v;
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }

-static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state,
-                                          const kvz_picture * const ref,
-                                          int32_t xpos,
-                                          int32_t ypos,
-                                          int32_t block_width,
-                                          int32_t block_height,
-                                          const int16_t mv_param[2],
-                                          hi_prec_buf_t *hi_prec_out)
+static void inter_recon_frac_chroma_hi(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  hi_prec_buf_t *hi_prec_out)
 {
  int mv_frac_x = (mv_param[0] & 7);
  int mv_frac_y = (mv_param[1] & 7);

-  // Translate to chroma
-  xpos >>= 1;
-  ypos >>= 1;
-  block_width >>= 1;
-  block_height >>= 1;
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;

-  // Fractional chroma 1/8-pel
-  kvz_extended_block src_u = { 0, 0, 0, 0 };
-  kvz_extended_block src_v = { 0, 0, 0, 0 };
+  // Chroma U
+  // Divisions by 2 due to 4:2:0 chroma subsampling
+  kvz_epol_args epol_args = {
+    .src = ref->u,
+    .src_w = ref->width / 2,
+    .src_h = ref->height / 2,
+    .src_s = ref->stride / 2,
+    .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
+    .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
+    .blk_w = block_width / 2,
+    .blk_h = block_height / 2,
+    .pad_l = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_t = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b_simd = 3 // Three rows for AVX2
+  };

-  //Fractional chroma U
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->u,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_u);
-  kvz_sample_14bit_octpel_chroma(state->encoder_control,
-                                         src_u.orig_topleft,
-                                         src_u.stride,
-                                         block_width,
-                                         block_height,
-                                         hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
-                                         LCU_WIDTH_C,
-                                         mv_frac_x,
-                                         mv_frac_y,
-                                         mv_param);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;

-  //Fractional chroma V
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->v,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_v);
-  kvz_sample_14bit_octpel_chroma(state->encoder_control,
-                                         src_v.orig_topleft,
-                                         src_v.stride,
-                                         block_width,
-                                         block_height,
-                                         hi_prec_out->v + (ypos  % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
-                                         LCU_WIDTH_C,
-                                         mv_frac_x,
-                                         mv_frac_y,
-                                         mv_param);
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma_hi(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);

-  if (src_u.malloc_used) free(src_u.buffer);
-  if (src_v.malloc_used) free(src_v.buffer);
+  // Chroma V
+  epol_args.src = ref->v;
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma_hi(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }


@ -348,7 +382,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
    if (fractional_luma) {
      // With a fractional MV, do interpolation.
      if (state->encoder_control->cfg.bipred && hi_prec_out) {
-        inter_recon_14bit_frac_luma(state, ref,
+        inter_recon_frac_luma_hi(state, ref,
          pu_in_tile.x, pu_in_tile.y,
          width, height,
          mv_param, hi_prec_out);
@ -386,7 +420,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
  if (fractional_luma || fractional_chroma) {
    // With a fractional MV, do interpolation.
    if (state->encoder_control->cfg.bipred && hi_prec_out) {
-      inter_recon_14bit_frac_chroma(state, ref,
+      inter_recon_frac_chroma_hi(state, ref,
                                    pu_in_tile.x, pu_in_tile.y,
                                    width, height,
                                    mv_param, hi_prec_out);
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@ -27,6 +27,7 @@
 */

 #include <stdint.h>
+#include <stdio.h>


 #ifdef __cplusplus
@ -473,6 +474,20 @@ typedef struct kvz_config
  enum kvz_file_format file_format;

  char *stats_file_prefix;
+  char *fast_coeff_table_fn;   /*!< \brief Pointer to fast coeff table filename */
+
+  /** \brief whether we're sampling TBs and their costs for fast cost
+   *         estimation training */
+  uint8_t rdo_cost_sampling_mode_on;
+
+  /** \brief whether we're running in normal mode, sampling TBs and their cost
+   *         for fast estimation training, or comparing estimator accuracy to
+   *         CABAC */
+  uint8_t fastrd_sampling_on;
+  uint8_t fastrd_accuracy_check_on;
+
+  char *fastrd_learning_outdir_fn;
+

  struct param_set_map *param_set_map;

--- a/src/rate_control.c
+++ b/src/rate_control.c
@ -803,9 +803,10 @@ static double qp_to_lambda(encoder_state_t* const state, int qp)
    int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
    int aq_offset = round(state->frame->aq_offsets[id]);
    state->qp += aq_offset;
-    // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Maximum delta QP is clipped according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Clipping range is a function of bit depth
    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
-    state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
+    state->qp = CLIP(state->frame->QP + KVZ_QP_DELTA_MIN / 2, state->frame->QP + KVZ_QP_DELTA_MAX / 2, state->qp);
    state->qp = CLIP_TO_QP(state->qp);
    state->lambda = qp_to_lambda(state, state->qp);
    state->lambda_sqrt = sqrt(state->lambda);
@ -1149,9 +1150,10 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
    int id = lcu_pos.x + lcu_pos.y * state->tile->frame->width_in_lcu;
    int aq_offset = round(state->frame->aq_offsets[id]);
    state->qp += aq_offset;    
-    // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Maximum delta QP is clipped according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Clipping range is a function of bit depth
    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
-    state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
+    state->qp = CLIP(state->frame->QP + KVZ_QP_DELTA_MIN / 2, state->frame->QP + KVZ_QP_DELTA_MAX / 2, state->qp);
    state->qp = CLIP_TO_QP(state->qp);
    state->lambda = qp_to_lambda(state, state->qp);
    state->lambda_sqrt = sqrt(state->lambda);
--- a/src/rdo.c
+++ b/src/rdo.c
@ -20,8 +20,10 @@

 #include "rdo.h"

+#include <errno.h>
 #include <stdlib.h>
 #include <string.h>
+#include <pthread.h>

 #include "cabac.h"
 #include "context.h"
@ -43,6 +45,11 @@
 #define LOG2_SCAN_SET_SIZE    4
 #define SBH_THRESHOLD         4

+#define RD_SAMPLING_MAX_LAST_QP     50
+
+static FILE *fastrd_learning_outfile[RD_SAMPLING_MAX_LAST_QP + 1] = {NULL};
+static pthread_mutex_t outfile_mutex[RD_SAMPLING_MAX_LAST_QP + 1];
+
 const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
 const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
 const uint32_t g_auiGoRiceParsCoeff[32] =
@ -152,6 +159,67 @@ struct sh_rates_t {
  int32_t quant_delta[32 * 32];
 };

+/**
+ * \brief Create the per-QP output files and mutexes used for RD cost
+ *        sampling.
+ *
+ * One file named "<dir_path>/<QP>.txt" (QP zero-padded to two digits) is
+ * opened for writing for every QP from 0 to RD_SAMPLING_MAX_LAST_QP, and one
+ * mutex is initialized for each of them.
+ *
+ * \param dir_path  Directory in which the output files are created.
+ * \return 0 on success, -1 on failure. On failure every file opened and
+ *         every mutex initialized by this call has been released again.
+ */
+int kvz_init_rdcost_outfiles(const char *dir_path)
+{
+#define RD_SAMPLING_MAX_FN_LENGTH 4095
+  char fn[RD_SAMPLING_MAX_FN_LENGTH + 1];
+  int rv = 0, qp;
+
+  if (dir_path == NULL) {
+    fprintf(stderr, "No output directory given for RD cost sampling\n");
+    return -1;
+  }
+
+  for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
+    pthread_mutex_t *curr = outfile_mutex + qp;
+
+    if (pthread_mutex_init(curr, NULL) != 0) {
+      fprintf(stderr, "Failed to create mutex\n");
+      rv = -1;
+      // Mutex number qp was not created, clean up the ones before it.
+      qp--;
+      goto out_destroy_mutexes;
+    }
+  }
+
+  for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
+    FILE *curr;
+
+    // dir_path is passed as an argument instead of being spliced into the
+    // format string, so a '%' in the directory name is taken literally.
+    int len = snprintf(fn, sizeof(fn), "%s/%02i.txt", dir_path, qp);
+    if (len < 0 || (size_t)len >= sizeof(fn)) {
+      fprintf(stderr, "Output file name too long in directory %s\n", dir_path);
+      rv = -1;
+      qp--;
+      goto out_close_files;
+    }
+
+    curr = fopen(fn, "w");
+    if (curr == NULL) {
+      fprintf(stderr, "Failed to open %s: %s\n", fn, strerror(errno));
+      rv = -1;
+      // File number qp was not opened, clean up the ones before it.
+      qp--;
+      goto out_close_files;
+    }
+    fastrd_learning_outfile[qp] = curr;
+  }
+  goto out;
+
+out_close_files:
+  for (; qp >= 0; qp--) {
+    fclose(fastrd_learning_outfile[qp]);
+    fastrd_learning_outfile[qp] = NULL;
+  }
+  // Every mutex was created before the first file was opened, so all of
+  // them have to be destroyed when opening a file fails.
+  qp = RD_SAMPLING_MAX_LAST_QP;
+
+out_destroy_mutexes:
+  for (; qp >= 0; qp--) {
+    pthread_mutex_destroy(outfile_mutex + qp);
+  }
+
+out:
+  return rv;
+#undef RD_SAMPLING_MAX_FN_LENGTH
+}
+

 /**
 * \brief Calculate actual (or really close to actual) bitcost for coding
@ -205,6 +273,33 @@ static INLINE uint32_t get_coeff_cabac_cost(
  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
 }

+/**
+ * \brief Append one sampled coefficient block and its CABAC cost to the
+ *        output file of the given QP.
+ *
+ * The record is written as raw binary: the coefficient count, the cost and
+ * then the coefficients themselves. The write is done while holding the
+ * mutex that belongs to the QP.
+ *
+ * \param qp     QP selecting the output file and mutex.
+ * \param coeff  Coefficients of the block.
+ * \param size   Number of coefficients in coeff.
+ * \param ccc    CABAC cost of the block.
+ */
+static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
+{
+  assert(sizeof(coeff_t) == sizeof(int16_t));
+  assert(qp <= RD_SAMPLING_MAX_LAST_QP);
+
+  pthread_mutex_t *const lock = &outfile_mutex[qp];
+
+  pthread_mutex_lock(lock);
+  {
+    FILE *const out = fastrd_learning_outfile[qp];
+
+    fwrite(&size, sizeof(size), 1, out);
+    fwrite(&ccc, sizeof(ccc), 1, out);
+    fwrite(coeff, sizeof(coeff_t), size, out);
+  }
+  pthread_mutex_unlock(lock);
+}
+
+/**
+ * \brief Append one "fast cost, CABAC cost" pair as a text line to the
+ *        output file of the given QP.
+ *
+ * The line is written while holding the mutex that belongs to the QP.
+ *
+ * \param qp         QP selecting the output file and mutex.
+ * \param ccc        Cost obtained from CABAC.
+ * \param fast_cost  Cost obtained from the fast estimator.
+ */
+static INLINE void save_accuracy(int qp, uint32_t ccc, uint32_t fast_cost)
+{
+  assert(qp <= RD_SAMPLING_MAX_LAST_QP);
+
+  pthread_mutex_t *const lock = &outfile_mutex[qp];
+
+  pthread_mutex_lock(lock);
+  {
+    FILE *const out = fastrd_learning_outfile[qp];
+
+    // The fast estimate comes first on the line, the CABAC cost second.
+    fprintf(out, "%u %u\n", fast_cost, ccc);
+  }
+  pthread_mutex_unlock(lock);
+}
+
 /**
 * \brief Estimate bitcost for coding coefficients.
 *
@ -220,14 +315,32 @@ uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
                            int32_t type,
                            int8_t scan_mode)
 {
-  if (state->qp >= state->encoder_control->cfg.fast_residual_cost_limit) {
-    return get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
+  uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on;
+  uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on;

+  if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
+      state->qp < MAX_FAST_COEFF_COST_QP) {
+    // TODO: do we need to assert(0) out of the fast-estimation branch if we
+    // are to save block costs, or should we just warn about it somewhere
+    // earlier (configuration validation I guess)?
+    if (save_cccs) {
+      assert(0 && "Fast RD sampling does not work with fast-residual-cost");
+      return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
+    } else {
+      uint64_t weights = kvz_fast_coeff_get_weights(state);
+      uint32_t fast_cost = kvz_fast_coeff_cost(coeff, width, weights);
+      if (check_accuracy) {
+        uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
+        save_accuracy(state->qp, ccc, fast_cost);
+      }
+      return fast_cost;
+    }
  } else {
-    // Estimate coeff coding cost based on QP and sum of absolute coeffs.
-    // const uint32_t sum = kvz_coeff_abs_sum(coeff, width * width);
-    // return (uint32_t)(sum * (state->qp * COEFF_COST_QP_FACTOR + COEFF_COST_BIAS) + 0.5);
-    return kvz_fast_coeff_cost(coeff, width, state->qp);
+    uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
+    if (save_cccs) {
+      save_ccc(state->qp, coeff, width * width, ccc);
+    }
+    return ccc;
  }
 }

@ -1192,3 +1305,18 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
  return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
 }

+/**
+ * \brief Close the per-QP RD cost output files and destroy their mutexes.
+ *
+ * Every QP from 0 to RD_SAMPLING_MAX_LAST_QP inclusive is handled. Slots
+ * whose file is not open are skipped, so calling this function when no
+ * files are open, or calling it a second time, does nothing.
+ */
+void kvz_close_rdcost_outfiles(void)
+{
+  int i;
+
+  // The arrays hold RD_SAMPLING_MAX_LAST_QP + 1 entries, so the last QP has
+  // to be included in the loop.
+  for (i = 0; i <= RD_SAMPLING_MAX_LAST_QP; i++) {
+    FILE *curr = fastrd_learning_outfile[i];
+
+    if (curr == NULL) {
+      // Never opened or already closed. The mutex is left alone as well,
+      // because destroying a mutex that is not initialized is undefined.
+      continue;
+    }
+
+    if (fclose(curr) != 0) {
+      fprintf(stderr, "Failed to close output file for QP %i: %s\n", i, strerror(errno));
+    }
+    fastrd_learning_outfile[i] = NULL;
+    pthread_mutex_destroy(outfile_mutex + i);
+  }
+}
--- a/src/rdo.h
+++ b/src/rdo.h
@ -36,6 +36,9 @@
 extern const uint32_t kvz_g_go_rice_range[5];
 extern const uint32_t kvz_g_go_rice_prefix_len[5];

+int kvz_init_rdcost_outfiles(const char *fn_template);
+void kvz_close_rdcost_outfiles(void);
+
 void  kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
           int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth, uint16_t cbf);

--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -992,12 +992,11 @@ static void search_frac(inter_search_info_t *info)

  unsigned costs[4] = { 0 };

-  kvz_extended_block src = { 0, 0, 0, 0 };
-  ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH];
+  ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE];

  // Storage buffers for intermediate horizontally filtered results.
  // Have the first columns in contiguous memory for vectorization.
-  ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH];
+  ALIGNED(64) int16_t intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD];
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1];

  const kvz_picture *ref = info->ref;
@ -1013,20 +1012,45 @@ static void search_frac(inter_search_info_t *info)
  int8_t sample_off_x = 0;
  int8_t sample_off_y = 0;

-  kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1,
-                state->tile->offset_x,
-                state->tile->offset_y,
-                ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS,
-                internal_width+1, internal_height+1,
-                &src);
+  // Space for (possibly) extrapolated pixels and the part from the picture
+  // One extra row and column compared to normal interpolation and some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_FME_MAX_INPUT_SIZE_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;
+  kvz_epol_args epol_args = {
+    .src = ref->y,
+    .src_w = ref->width,
+    .src_h = ref->height,
+    .src_s = ref->stride,
+    .blk_x = state->tile->offset_x + orig.x + mv.x - 1,
+    .blk_y = state->tile->offset_y + orig.y + mv.y - 1,
+    .blk_w = internal_width + 1,  // TODO: real width
+    .blk_h = internal_height + 1, // TODO: real height
+    .pad_l = KVZ_LUMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_t = KVZ_LUMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_b_simd = 0 // AVX2 padding unnecessary because of blk_h
+  };
+
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;
+
+  kvz_get_extended_block(&epol_args);

  kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x;
  int tmp_stride = pic->stride;
                  
  // Search integer position
  costs[0] = kvz_satd_any_size(width, height,
-                            tmp_pic, tmp_stride,
-                            src.orig_topleft + src.stride + 1, src.stride);
+    tmp_pic, tmp_stride,
+    ext_origin + ext_s + 1, ext_s);

  costs[0] += info->mvd_cost_func(state,
                                  mv.x, mv.y, 2,
@ -1056,8 +1080,8 @@ static void search_frac(inter_search_info_t *info)
    const int mv_shift = (step < 2) ? 1 : 0;

    filter_steps[step](state->encoder_control,
-      src.orig_topleft,
-      src.stride,
+      ext_origin,
+      ext_s,
      internal_width,
      internal_height,
      filtered,
@ -1131,8 +1155,6 @@ static void search_frac(inter_search_info_t *info)
  info->best_mv = mv;
  info->best_cost = best_cost;
  info->best_bitcost = best_bitcost;
-
-  if (src.malloc_used) free(src.buffer);
 }

 /**
--- a/src/strategies/avx2/ipol-avx2.c
+++ b/src/strategies/avx2/ipol-avx2.c
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@ -40,6 +40,7 @@
 #include "strategyselector.h"
 #include "tables.h"
 #include "transform.h"
+#include "fast_coeff_cost.h"

 static INLINE int32_t hsum32_8x32i(__m256i src)
 {
@ -814,81 +815,63 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
  return parts[0] + parts[1] + parts[2] + parts[3];
 }

-#define TO_Q88(f) ((int16_t)((f) * 256.0f))
-
-static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t qp)
+static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights)
 {
-#define NUM_BUCKETS 5
-  static const int16_t wt_m[NUM_BUCKETS] = {
-    TO_Q88(-0.004916),
-    TO_Q88( 0.010806),
-    TO_Q88( 0.055562),
-    TO_Q88( 0.033436),
-    TO_Q88(-0.007690),
-  };
-  static const int16_t wt_c[NUM_BUCKETS] = {
-    TO_Q88( 0.172024),
-    TO_Q88( 3.421462),
-    TO_Q88( 2.879506),
-    TO_Q88( 5.585471),
-    TO_Q88( 0.256772),
-  };
+  const __m256i zero           = _mm256_setzero_si256();
+  const __m256i threes         = _mm256_set1_epi16(3);
+  const __m256i negate_hibytes = _mm256_set1_epi16(0xff00);
+  const __m128i wt_extract_los = _mm_cvtsi32_si128(0x06040200);
+  const __m128i wt_extract_his = _mm_cvtsi32_si128(0x07050301);

-  const __m256i zero   = _mm256_setzero_si256();
-  const __m256i threes = _mm256_set1_epi16(3);
-  const __m256i ones   = _mm256_srli_epi16(threes, 1);
-  const __m256i twos   = _mm256_slli_epi16(ones,   1);
+  __m256i lo_sum     = _mm256_setzero_si256();
+  __m256i hi_sum     = _mm256_setzero_si256();

-  __m256i wt[NUM_BUCKETS - 1];
-  for (int32_t i = 0; i < NUM_BUCKETS - 1; i++)
-    wt[i] = _mm256_set1_epi16(wt_m[i] * qp + wt_c[i]);
+  __m128i wts_128    = _mm_loadl_epi64 ((const __m128i *)&weights);
+  __m128i wts_lo_128 = _mm_shuffle_epi8(wts_128, wt_extract_los);
+  __m128i wts_hi_128 = _mm_shuffle_epi8(wts_128, wt_extract_his);

-  uint32_t wid_wt = width * (wt_m[NUM_BUCKETS - 1] * qp + wt_c[NUM_BUCKETS - 1]);
-  __m256i avx_inc = _mm256_setzero_si256();
+  __m256i wts_lo     = _mm256_broadcastsi128_si256(wts_lo_128);
+  __m256i wts_hi     = _mm256_broadcastsi128_si256(wts_hi_128);

-  for (int32_t i = 0; i < width * width; i += 16) {
-    __m256i curr      = _mm256_loadu_si256((__m256i *)(coeff + i));
-    __m256i curr_abs  = _mm256_abs_epi16  (curr);
-    __m256i curr_max3 = _mm256_min_epi16  (curr_abs, threes);
+  for (int i = 0; i < width * width; i += 32) {
+    __m256i curr_lo      = _mm256_loadu_si256 ((const __m256i *)(coeff + i));
+    __m256i curr_abs_lo  = _mm256_abs_epi16   (curr_lo);
+    __m256i curr_max3_lo = _mm256_min_epu16   (curr_abs_lo, threes);

-    __m256i curr_eq_0 = _mm256_cmpeq_epi16(curr_max3, zero);
-    __m256i curr_eq_1 = _mm256_cmpeq_epi16(curr_max3, ones);
-    __m256i curr_eq_2 = _mm256_cmpeq_epi16(curr_max3, twos);
-    __m256i curr_eq_3 = _mm256_cmpeq_epi16(curr_max3, threes);
+    // 4x4 blocks only have 16 coeffs, so handle them separately
+    __m256i curr_max3_hi;
+    if (width >= 8) {
+      __m256i curr_hi      = _mm256_loadu_si256 ((const __m256i *)(coeff + i + 16));
+      __m256i curr_abs_hi  = _mm256_abs_epi16   (curr_hi);
+              curr_max3_hi = _mm256_min_epu16   (curr_abs_hi, threes);
+              curr_max3_hi = _mm256_slli_epi16  (curr_max3_hi, 8);
+    } else {
+      // Set MSBs for high bytes if they're meaningless, so shuffles will
+      // return zeros for them
+      curr_max3_hi = negate_hibytes;
+    }
+    __m256i curr_max3    = _mm256_or_si256    (curr_max3_lo, curr_max3_hi);
+    __m256i curr_wts_lo  = _mm256_shuffle_epi8(wts_lo, curr_max3);
+    __m256i curr_wts_hi  = _mm256_shuffle_epi8(wts_hi, curr_max3);

-    __m256i curr_0_wt = _mm256_and_si256  (curr_eq_0, wt[0]);
-    __m256i curr_1_wt = _mm256_and_si256  (curr_eq_1, wt[1]);
-    __m256i curr_2_wt = _mm256_and_si256  (curr_eq_2, wt[2]);
-    __m256i curr_3_wt = _mm256_and_si256  (curr_eq_3, wt[3]);
+    __m256i curr_sum_lo  = _mm256_sad_epu8    (curr_wts_lo, zero);
+    __m256i curr_sum_hi  = _mm256_sad_epu8    (curr_wts_hi, zero);

-    // Use madd to horizontally sum 16-bit weights into 32-bit atoms
-    __m256i wt_0_32b  = _mm256_madd_epi16(curr_0_wt, ones);
-    __m256i wt_1_32b  = _mm256_madd_epi16(curr_1_wt, ones);
-    __m256i wt_2_32b  = _mm256_madd_epi16(curr_2_wt, ones);
-    __m256i wt_3_32b  = _mm256_madd_epi16(curr_3_wt, ones);
-
-    __m256i wt_01     = _mm256_add_epi32(wt_0_32b, wt_1_32b);
-    __m256i wt_23     = _mm256_add_epi32(wt_2_32b, wt_3_32b);
-    __m256i curr_wts  = _mm256_add_epi32(wt_01,    wt_23);
-    avx_inc           = _mm256_add_epi32(avx_inc,  curr_wts);
+            lo_sum       = _mm256_add_epi64   (lo_sum, curr_sum_lo);
+            hi_sum       = _mm256_add_epi64   (hi_sum, curr_sum_hi);
  }
-  __m128i inchi = _mm256_extracti128_si256(avx_inc, 1);
-  __m128i inclo = _mm256_castsi256_si128  (avx_inc);
+          hi_sum = _mm256_slli_epi64(hi_sum, 8);
+  __m256i sum0   = _mm256_add_epi64(lo_sum, hi_sum);

-  __m128i sum_1 = _mm_add_epi32    (inclo, inchi);
-  __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
-  __m128i sum_3 = _mm_add_epi32    (sum_1, sum_2);
-  __m128i sum_4 = _mm_shuffle_epi32(sum_3, _MM_SHUFFLE(2, 3, 0, 1));
-  __m128i sum   = _mm_add_epi32    (sum_3, sum_4);
+  __m256i sum1   = _mm256_permute4x64_epi64(sum0, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i sum2   = _mm256_add_epi64        (sum0, sum1);
+  __m256i sum3   = _mm256_shuffle_epi32    (sum2, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i sum4   = _mm256_add_epi64        (sum2, sum3);

-  uint32_t sum_u32 = _mm_cvtsi128_si32(sum);
-  uint32_t sum_total = sum_u32 + wid_wt;
-  return sum_total >> 8;
-#undef NUM_BUCKETS
+  __m128i sum128 = _mm256_castsi256_si128  (sum4);
+  return (_mm_cvtsi128_si32(sum128) + (1 << 7)) >> 8;
 }

-#undef TO_Q88
-
 #endif //COMPILE_INTEL_AVX2 && defined X86_64

 int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth)
--- a/src/strategies/generic/ipol-generic.c
+++ b/src/strategies/generic/ipol-generic.c
@ -156,7 +156,7 @@ void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder,
  }
 }

-void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
+void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
 {
  //TODO: horizontal and vertical only filtering
  int32_t x, y;
@ -194,8 +194,8 @@ void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@ -309,8 +309,8 @@ void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@ -390,8 +390,8 @@ void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@ -550,8 +550,8 @@ void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@ -694,7 +694,7 @@ void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, k
  }
 }

-void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
+void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
 {
  //TODO: horizontal and vertical only filtering
  int32_t x, y;
@ -728,59 +728,55 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco
 }


-void kvz_get_extended_block_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
-  int filter_size, int width, int height, kvz_extended_block *out) {
+void kvz_get_extended_block_generic(kvz_epol_args *args) {

-  int half_filter_size = filter_size >> 1;
+  int min_y = args->blk_y - args->pad_t;
+  int max_y = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1;
+  bool out_of_bounds_y = (min_y < 0) || (max_y >= args->src_h);

-  out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x);
-  out->stride = ref_width;
-  out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
-  out->malloc_used = 0;
+  int min_x = args->blk_x - args->pad_l;
+  int max_x = args->blk_x + args->blk_w + args->pad_r - 1;
+  bool out_of_bounds_x = (min_x < 0) || (max_x >= args->src_w);

-  int min_y = ypos - half_filter_size + off_y + mv_y;
-  int max_y = min_y + height + filter_size;
-  int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);
+  if (out_of_bounds_y || out_of_bounds_x) {

-  int min_x = xpos - half_filter_size + off_x + mv_x;
-  int max_x = min_x + width + filter_size;
-  int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);
+    *args->ext = args->buf;
+    *args->ext_s = args->pad_l + args->blk_w + args->pad_r;
+    *args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l;

-  int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;
+    // Note that stride equals width here.
+    int cnt_l = CLIP(0, *args->ext_s, -min_x);
+    int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1));
+    int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r);

-  if (sample_out_of_bounds){
-    out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size));
-    if (!out->buffer){
-      fprintf(stderr, "Memory allocation failed!\n");
-      assert(0);
+    // For each row including real padding.
+    // Don't read "don't care" values (SIMD padding). Zero them out.
+    int y;
+    for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) {
+
+      int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y);
+      kvz_pixel *sample_l = args->src + clipped_y * args->src_s;
+      kvz_pixel *sample_r = args->src + clipped_y * args->src_s + args->src_w - 1;
+      kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0);
+      kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s);
+      kvz_pixel *dst_m = dst_l + cnt_l;
+      kvz_pixel *dst_r = dst_m + cnt_m;
+      for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = *sample_l;
+      for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i);
+      for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = *sample_r;
    }
-    out->stride = width + filter_size;
-    out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
-    out->malloc_used = 1;

-    int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
-
-    for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) {
-
-      // calculate y-pixel offset
-      coord_y = y + off_y + mv_y;
-      coord_y = CLIP(0, (ref_height)-1, coord_y);
-      coord_y *= ref_width;
-
-      if (!out_of_bounds_x){
-        memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel));
-      } else {
-        for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) {
-
-          coord_x = x + off_x + mv_x;
-          coord_x = CLIP(0, (ref_width)-1, coord_x);
-
-          // Store source block data (with extended borders)
-          out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x];
-        }
-      }
+    for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) {
+      kvz_pixel *dst = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s);
+      FILL_ARRAY(dst, 0, *args->ext_s);
    }
-  } 
+
+  } else {
+
+    *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l);
+    *args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x;
+    *args->ext_s = args->src_s;
+  }
 }

 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
@ -793,8 +789,8 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
  success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic);
  success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic);
  success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic);
-  success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic);
-  success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic);
+  success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic);
+  success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic);
  success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic);

  return success;
--- a/src/strategies/generic/ipol-generic.h
+++ b/src/strategies/generic/ipol-generic.h
@ -32,9 +32,9 @@

 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
 void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
-void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
-void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);


 #endif //STRATEGIES_IPOL_GENERIC_H_
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@ -29,6 +29,7 @@
 #include "strategies/strategies-quant.h"
 #include "strategyselector.h"
 #include "transform.h"
+#include "fast_coeff_cost.h"

 #define QUANT_SHIFT 14
 /**
@ -342,46 +343,30 @@ static uint32_t coeff_abs_sum_generic(const coeff_t *coeffs, size_t length)
  return sum;
 }

-static INLINE int16_t to_q88(float f)
+static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights)
 {
-  return (int16_t)(f * 256.0f);
+  weights[0] = (wts_packed >>  0) & 0xffff;
+  weights[1] = (wts_packed >> 16) & 0xffff;
+  weights[2] = (wts_packed >> 32) & 0xffff;
+  weights[3] = (wts_packed >> 48) & 0xffff;
 }

-static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t qp)
+static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights)
 {
  uint32_t sum = 0;
-#define NUM_BUCKETS 5
-  const int16_t wt_m[NUM_BUCKETS] = {
-    to_q88(-0.004916),
-    to_q88(0.010806),
-    to_q88(0.055562),
-    to_q88(0.033436),
-    to_q88(-0.007690),
-  };
-  const int16_t wt_c[NUM_BUCKETS] = {
-    to_q88(0.172024),
-    to_q88(3.421462),
-    to_q88(2.879506),
-    to_q88(5.585471),
-    to_q88(0.256772),
-  };
+  uint16_t weights_unpacked[4];

-  int16_t wt[NUM_BUCKETS];
-  for (int32_t i = 0; i < NUM_BUCKETS; i++)
-    wt[i] = wt_m[i] * qp + wt_c[i];
+  get_coeff_weights(weights, weights_unpacked);

  for (int32_t i = 0; i < width * width; i++) {
-    int16_t curr = coeff[i];
-    int16_t signmask = curr >> 15;
-    int16_t curr_abs = (curr ^ signmask) - signmask;
-    if (curr_abs > 3)
+     int16_t curr = coeff[i];
+    uint32_t curr_abs = abs(curr);
+    if (curr_abs > 3) {
      curr_abs = 3;
-
-    sum += wt[curr_abs];
+    }
+    sum += weights_unpacked[curr_abs];
  }
-  sum += wt[NUM_BUCKETS - 1] * width;
-  return sum >> 8;
-#undef NUM_BUCKETS
+  return (sum + (1 << 7)) >> 8;
 }

 int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth)
--- a/src/strategies/strategies-ipol.c
+++ b/src/strategies/strategies-ipol.c
@ -33,8 +33,8 @@ ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
 epol_func *kvz_get_extended_block;
 kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
 kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
-kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
-kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
+kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
+kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;


 int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) {
--- a/src/strategies/strategies-ipol.h
+++ b/src/strategies/strategies-ipol.h
@ -31,21 +31,63 @@
 #include "kvazaar.h"
 #include "search_inter.h"

+// The AVX2 implementation of the horizontal filter reads and
+// writes two rows at a time for luma and four for chroma.
+// Extra vertical padding is added to prevent segfaults.
+// Horizontal padding is not needed even if one extra byte
+// is read because kvz_image_alloc adds enough padding.
+#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA)
+#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA)
+#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH)
+#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C)
+
+// On top of basic interpolation, FME needs one extra
+// column and row for ME (left and up). Adding the
+// extra row happens to satisfy AVX2 requirements for
+// row count. No other extra rows are needed.
+#define KVZ_FME_MAX_INPUT_SIZE_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 1))

 typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block;

 typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], 
+  kvz_pixel filtered[4][LCU_LUMA_SIZE], int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t sample_off_x, int8_t sample_off_y);

-typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
-  int filter_size, int width, int height, kvz_extended_block *out);
+typedef struct { // Argument bundle passed by pointer to an epol_func
+  // Source samples
+  kvz_pixel *src; // Pointer to the top-left source sample
+  int src_w; // Source width
+  int src_h; // Source height
+  int src_s; // Source stride
+
+  // Requested sampling position, base dimensions, and padding
+  int blk_x; // Requested x position
+  int blk_y; // Requested y position
+  int blk_w; // Base width, padding not included
+  int blk_h; // Base height, padding not included
+  int pad_l; // Padding on the left
+  int pad_r; // Padding on the right
+  int pad_t; // Padding on the top
+  int pad_b; // Padding on the bottom
+  int pad_b_simd; // Extra "don't care" rows at the end. Zeroed out.
+
+  // Buffer for possible extrapolation. Unused memory provided by the caller.
+  kvz_pixel *buf;
+
+  // Extended block data. These are outputs, set by the function.
+  kvz_pixel **ext; // Top-left sample with padding
+  kvz_pixel **ext_origin; // Top-left sample without padding
+  int *ext_s; // Stride of the extended block
+} kvz_epol_args;
+
+typedef void(epol_func)(kvz_epol_args *args);
+

 typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);

-typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
-typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);

 // Declare function pointers.
 extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma;
@ -55,8 +97,8 @@ extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
 extern epol_func * kvz_get_extended_block;
 extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
 extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
-extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
-extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
+extern kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
+extern kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;


 int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
@ -69,8 +111,8 @@ int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
  {"filter_qpel_blocks_diag_luma",    (void**) &kvz_filter_qpel_blocks_diag_luma}, \
  {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \
  {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \
-  {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
-  {"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \
+  {"sample_quarterpel_luma_hi", (void**) &kvz_sample_quarterpel_luma_hi}, \
+  {"sample_octpel_chroma_hi", (void**) &kvz_sample_octpel_chroma_hi}, \
  {"get_extended_block", (void**) &kvz_get_extended_block}, \


--- a/src/strategies/strategies-quant.h
+++ b/src/strategies/strategies-quant.h
@ -32,7 +32,6 @@
 #include "kvazaar.h"
 #include "tables.h"

-
 // Declare function pointers.
 typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
  int32_t height, int8_t type, int8_t scan_idx, int8_t block_type);
@ -45,7 +44,7 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state,
  bool early_skip);
 typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
  int32_t height, int8_t type, int8_t block_type);
-typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp);
+typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);

 typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);

--- a/src/transform.h
+++ b/src/transform.h
@ -31,7 +31,6 @@
 #include "encoderstate.h"
 #include "global.h" // IWYU pragma: keep

-
 extern const uint8_t kvz_g_chroma_scale[58];
 extern const int16_t kvz_g_inv_quant_scales[6];

--- a/tests/tsan_suppressions.txt
+++ b/tests/tsan_suppressions.txt
@ -1,3 +1,4 @@
-race:kvz_eight_tap_filter_hor_8x1_avx2
+# AVX2 interpolation reads some extra pixels
+race:kvz_ipol_8tap_hor_px_im_avx2
 race:kvz_filter_hpel_blocks_hor_ver_luma_avx2
-race:kvz_eight_tap_filter_hor_avx2
+race:kvz_eight_tap_filter_hor_avx2