Tool to extract RDO bitrates

2024-11-23 18:14:06 +00:00 · 2019-03-18 15:35:03 +02:00 · 2019-03-18 15:35:03 +02:00 · 33dd9c95cd
parent e833354cdd
commit 33dd9c95cd
8 changed files with 440 additions and 4 deletions
--- a/src/README-rdcost-thingy.txt
+++ b/src/README-rdcost-thingy.txt
@ -0,0 +1,33 @@
+Build Kvazaar as usual with make, then edit extract_rdcosts.py so that the
+parameters suit your usage (the directories, num of threads and Kvazaar
+params) and then run extract_rdcosts.py. It will run a lot of Kvazaar
+instances in parallel to encode a lot of videos and sift off all the coeff
+groups they measure RD cost for. The coeff groups will be written into the
+relevant data file in the following format (although through GZIP):
+
+Size (B)  | Description
+----------+------------
+4         | size:   Coeff group size, in int16's
+4         | ccc:    Coeff group's coding cost
+size * 2  | coeffs: Coeff group data
+
+You can roll your own filter_rdcosts.c program to analyze the data the way
+you want, and run it like:
+
+$ gzip -d < /path/to/compressed_datafile.gz | ./filter_rdcosts | less
+
+Maybe one day, there'll be a multithreaded script like extract_rdcosts.py to
+automate and parallelize processing of a massive heap of data files.
+
+EDIT:
+It's now possible to do OLS regression by streaming the source data twice
+from source and using Octave to invert the temporary result matrix, and
+that's what run_filter.py does in parallel. To do this on data you've
+gathered by extract_rdcosts.py:
+
+$ gcc filter_rdcosts.c -o frcosts_matrix
+$ gcc ols_2ndpart.c -o ols_2ndpart
+$ ./run_filter.py
+
+Although you should probably adjust the run_filter.py params before actually
+running it
--- a/src/estimate.m
+++ b/src/estimate.m
@ -0,0 +1,5 @@
+data = dlmread("/dev/stdin", " ");
+coeffs = data(1:end, 1:5);
+costs = data(1:end, 6);
+[beta, sigma, r] = ols(costs, coeffs);
+disp(beta)
--- a/src/extract_rdcosts.py
+++ b/src/extract_rdcosts.py
@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+import glob
+import os
+import subprocess
+import threading
+import time
+
+logdir = os.path.join("/tmp", "rdcost", "logs")
+ofdir  = os.path.join("/tmp", "rdcost", "data")
+
+n_threads   = 8
+home_rel    = lambda path: os.path.join(os.environ["HOME"], path)
+
+qps         = range(12, 14)
+sequences   = ("/opt/test_seqs/hevc-B/*.yuv",)
+
+kvzargs     = [home_rel("kvazaar/src/kvazaar"), "--threads", "1", "--preset=ultrafast"]
+kvzenv      = {"LD_LIBRARY_PATH": home_rel("kvazaar/src/.libs/")}
+
+gzargs      = ["gzip"]
+
+class MTSafeIterable:
+    def __init__(self, iterable):
+        self.lock = threading.Lock()
+        self.iterable = iterable
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        with self.lock:
+            return next(self.iterable)
+
+def combinations(xi, yi):
+    for x in xi:
+        for y in yi:
+            yield (x, y)
+
+def chain(lol):
+    for l in lol:
+        for i in l:
+            yield i
+
+def run_job(job):
+    ifpath, qp = job
+    ifname = os.path.basename(ifpath)
+
+    jobname  = "%s-qp%i" % (ifname, qp)
+    hevcname = "%s.hevc" % jobname
+    logname  = "%s.log"  % jobname
+    ofname   = "%s.gz"   % jobname
+
+    hevcpath = os.path.join("/tmp", hevcname)
+    logpath  = os.path.join(logdir, logname)
+    ofpath   = os.path.join(ofdir,  ofname)
+
+    my_kvzargs = kvzargs + ["-i",   ifpath,
+                            "--qp", str(qp),
+                            "-o",   hevcpath]
+
+    with open(logpath, "w") as lf:
+        with open(ofpath, "wb") as of:
+            kvz = subprocess.Popen(my_kvzargs, env=kvzenv, stdout=subprocess.PIPE, stderr=lf)
+            gzip = subprocess.Popen(gzargs, stdin=kvz.stdout, stdout=of)
+
+            gzip.communicate()
+            kvz.communicate()
+
+def threadfunc(joblist):
+    for job in joblist:
+        run_job(job)
+
+def main():
+    jobs = combinations(chain(map(glob.glob, sequences)), qps)
+    joblist = MTSafeIterable(jobs)
+
+    threads = [threading.Thread(target=threadfunc, args=(joblist,)) for _ in range(n_threads)]
+    for thread in threads:
+        thread.start()
+
+    for thread in threads:
+        thread.join()
+
+if (__name__ == "__main__"):
+    main()
--- a/src/filter_rdcosts.c
+++ b/src/filter_rdcosts.c
@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define BUFSZ (64 * 64 * sizeof(uint16_t))
+#define NUM_COEFF_BUCKETS (4)
+#define NUM_OTHER_BUCKETS (0)
+#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
+#define MAX_COEFF_BUCKET  ((NUM_COEFF_BUCKETS) - 1)
+
+#define clz(x) __builtin_clz(x)
+#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
+
+void print_coeffs(const int16_t *buf, uint32_t size, uint32_t ccc)
+{
+  uint32_t i;
+  printf("Buf size %u, ccc %u\n", size, ccc);
+  for (i = 0; i < size; i++)
+    printf("%i ", buf[i]);
+  printf("\n");
+}
+
+void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs, uint16_t *excess)
+{
+  *excess = 0;
+  uint32_t i;
+
+  for (i = 0; i < size; i++) {
+    int16_t curr = buf[i];
+    int16_t is_signed = curr >> 15;
+    *num_signs += (is_signed & 1);
+
+    uint16_t abs = (curr ^ is_signed) - is_signed;
+    if (abs > MAX_COEFF_BUCKET) {
+      *excess += abs - MAX_COEFF_BUCKET;
+      abs = MAX_COEFF_BUCKET;
+    }
+
+    buckets[abs]++;
+  }
+}
+
+void print_buckets(const uint64_t *buckets, uint64_t num_signs)
+{
+  uint32_t i;
+  for (i = 0; i < NUM_COEFF_BUCKETS; i++)
+    printf("%3u: %lu\n", i, buckets[i]);
+  printf("Signs: %lu\n", num_signs);
+}
+
+void update_matrix(const uint64_t *buckets, uint64_t *mat)
+{
+  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
+    for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
+      int curr_pos = y * NUM_TOTAL_BUCKETS + x;
+      mat[curr_pos] += buckets[x] * buckets[y];
+    }
+  }
+}
+
+void process_rdcosts(FILE *in, FILE *out)
+{
+  void *buf = malloc(BUFSZ);
+  uint32_t *u32buf = (uint32_t *)buf;
+  int16_t  *i16buf = (int16_t  *)buf;
+
+  float weights[NUM_TOTAL_BUCKETS] = {0.0f};
+
+  uint64_t mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0};
+
+  while (!feof(in)) {
+    uint32_t size, ccc, size_sqrt;
+    uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
+    uint64_t cg_num_signs = 0;
+    uint16_t excess = 0;
+
+    fread(buf, sizeof(uint32_t), 2, in);
+    size = u32buf[0];
+    ccc  = u32buf[1];
+
+    size_sqrt = 1 << (ilog2(size) >> 1);
+
+    fread(buf, sizeof(int16_t), size, in);
+
+    count_coeffs(i16buf, size, cg_buckets, &cg_num_signs, &excess);
+    update_matrix(cg_buckets, mat);
+  }
+  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
+    for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
+      int curr_pos = y * NUM_TOTAL_BUCKETS + x;
+      printf("%lu ", mat[curr_pos]);
+    }
+    printf("\n");
+  }
+
+
+  free(buf);
+}
+
+int main(int ar, char **av)
+{
+  process_rdcosts(stdin, stdout);
+}
--- a/src/invert_matrix.m
+++ b/src/invert_matrix.m
@ -0,0 +1,3 @@
+A = dlmread("/dev/stdin");
+B = inv(A);
+dlmwrite("/dev/stdout", B, " ");
--- a/src/ols_2ndpart.c
+++ b/src/ols_2ndpart.c
@ -0,0 +1,102 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#define BUFSZ (64 * 64 * sizeof(uint16_t))
+#define NUM_COEFF_BUCKETS (4)
+#define NUM_OTHER_BUCKETS (0)
+#define NUM_TOTAL_BUCKETS ((NUM_COEFF_BUCKETS) + (NUM_OTHER_BUCKETS))
+#ifdef ERR_SQUARED
+#define STEPSIZE (0.00000001f * 0.000001f)
+#else
+#define STEPSIZE (0.00000001f)
+#endif
+
+#define clz(x) __builtin_clz(x)
+#define ilog2(x) (sizeof(x) * 8 - clz(x) - 1)
+#define coord(x,y,w) ((x)+((y)*(w)))
+
+void update_result(const uint64_t *buckets, uint64_t ccc, const double *mat, double *res)
+{
+  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
+    double addend = 0.0;
+    for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
+      addend += mat[coord(x, y, NUM_TOTAL_BUCKETS)] * (double)buckets[x];
+    }
+    addend *= (double)ccc;
+    res[y] += addend;
+  }
+}
+
+void read_matrix(const char *fn, double *mat)
+{
+  FILE *f = fopen(fn, "r");
+  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++) {
+    for (int x = 0; x < NUM_TOTAL_BUCKETS; x++) {
+      float curr;
+      fscanf(f, "%f", &curr);
+      mat[x + y * NUM_TOTAL_BUCKETS] = curr;
+    }
+  }
+  fclose(f);
+}
+
+void count_coeffs(const int16_t *buf, uint32_t size, uint64_t *buckets, uint64_t *num_signs)
+{
+  uint32_t i;
+  for (i = 0; i < size; i++) {
+    int16_t curr = buf[i];
+    int16_t is_signed = curr >> 15;
+    *num_signs += (is_signed & 1);
+
+    uint16_t abs = (curr ^ is_signed) - is_signed;
+    if (abs >= NUM_COEFF_BUCKETS)
+      abs = NUM_COEFF_BUCKETS - 1;
+
+    buckets[abs]++;
+  }
+}
+
+void process_rdcosts(FILE *in, FILE *out, const double *mat)
+{
+  void *buf = malloc(BUFSZ);
+  uint32_t *u32buf = (uint32_t *)buf;
+  int16_t  *i16buf = (int16_t  *)buf;
+
+  double res[NUM_TOTAL_BUCKETS] = {0.0};
+
+  while (!feof(in)) {
+    uint32_t size, ccc, size_sqrt;
+    uint64_t cg_buckets[NUM_TOTAL_BUCKETS] = {0};
+    uint64_t cg_num_signs = 0;
+
+    fread(buf, sizeof(uint32_t), 2, in);
+    size = u32buf[0];
+    ccc  = u32buf[1];
+
+    size_sqrt = 1 << (ilog2(size) >> 1);
+
+    fread(buf, sizeof(int16_t), size, in);
+
+    count_coeffs(i16buf, size, cg_buckets, &cg_num_signs);
+    update_result(cg_buckets, ccc, mat, res);
+  }
+
+  for (int y = 0; y < NUM_TOTAL_BUCKETS; y++)
+    fprintf(out, "%g\n", (float)(res[y]));
+
+  free(buf);
+}
+
+int main(int ar, char **av)
+{
+  double mat[NUM_TOTAL_BUCKETS * NUM_TOTAL_BUCKETS] = {0.0};
+  if (ar != 2) {
+    fprintf(stderr, "gib matrix plz\n");
+    return 1;
+  }
+  read_matrix(av[1], mat);
+  process_rdcosts(stdin, stdout, mat);
+}
+
--- a/src/rdo.c
+++ b/src/rdo.c
@ -22,6 +22,7 @@

 #include <stdlib.h>
 #include <string.h>
+#include <pthread.h>

 #include "cabac.h"
 #include "context.h"
@ -194,6 +195,26 @@ static INLINE uint32_t get_coeff_cabac_cost(
  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
 }

+static INLINE void save_ccc(const coeff_t *coeff, int32_t size, uint32_t ccc)
+{
+  const uint64_t flush_count = 4096;
+
+  static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+  static uint64_t count = 0;
+  pthread_mutex_lock(&mtx);
+
+  assert(sizeof(coeff_t) == sizeof(int16_t));
+
+  fwrite(&size,  sizeof(size),     1,    stdout);
+  fwrite(&ccc,   sizeof(ccc),      1,    stdout);
+  fwrite( coeff, sizeof(coeff_t),  size, stdout);
+
+  if (((++count) % flush_count) == 0)
+    fflush(stdout);
+
+  pthread_mutex_unlock(&mtx);
+}
+
 /**
 * \brief Estimate bitcost for coding coefficients.
 *
@ -209,13 +230,21 @@ uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
                            int32_t type,
                            int8_t scan_mode)
 {
+  int save_cccs = 1; // TODO!
  if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
      state->qp < MAX_FAST_COEFF_COST_QP) {
-
+    if (save_cccs) {
+      assert(0 && "Plz no fast-residual-cost");
+    } else {
      uint64_t weights = kvz_fast_coeff_get_weights(state);
      return kvz_fast_coeff_cost(coeff, width, weights);
+    }
  } else {
-    return get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
+    uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
+    if (save_cccs) {
+      save_ccc(coeff, width * width, ccc);
+    }
+    return ccc;
  }
 }

--- a/src/run_filter.py
+++ b/src/run_filter.py
@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+import glob
+import os
+import subprocess
+import tempfile
+import threading
+import time
+
+n_threads   = 8
+data        = "/home/moptim/rdcost/data/*.gz"
+gzargs      = ["gzip", "-d"]
+filtargs    = ["./frcosts_matrix"]
+octargs     = ["octave-cli", "invert_matrix.m"]
+filt2args   = ["./ols_2ndpart"]
+resultdir   = os.path.join("/tmp", "rdcost", "coeff_buckets")
+
+class MTSafeIterable:
+    def __init__(self, iterable):
+        self.lock = threading.Lock()
+        self.iterable = iterable
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        with self.lock:
+            return next(self.iterable)
+
+def run_job(job):
+    datapath   = job
+    resultpath = os.path.join(resultdir, os.path.basename(job) + ".result")
+
+    print("Running job %s" % datapath)
+
+    with open(resultpath, "w") as rf:
+        with tempfile.NamedTemporaryFile() as tf:
+            with open(datapath, "rb") as df:
+                f2a = list(filt2args)
+                f2a.append(tf.name)
+                gzip = subprocess.Popen(gzargs, stdin=df, stdout=subprocess.PIPE)
+                filt = subprocess.Popen(filtargs, stdin=gzip.stdout, stdout=subprocess.PIPE)
+                octa = subprocess.Popen(octargs, stdin=filt.stdout, stdout=tf)
+
+                octa.communicate()
+                filt.communicate()
+                gzip.communicate()
+
+            with open(datapath, "rb") as df:
+                gz2  = subprocess.Popen(gzargs, stdin=df, stdout=subprocess.PIPE)
+                f2   = subprocess.Popen(f2a, stdin=gz2.stdout, stdout=rf)
+
+                f2.communicate()
+                gz2.communicate()
+
+    print("Job %s done" % datapath)
+
+def threadfunc(joblist):
+    for job in joblist:
+        run_job(job)
+
+def main():
+    jobs = glob.glob(data)
+    joblist = MTSafeIterable(iter(jobs))
+
+    threads = [threading.Thread(target=threadfunc, args=(joblist,)) for _ in range(n_threads)]
+    for thread in threads:
+        thread.start()
+
+    for thread in threads:
+        thread.join()
+
+if (__name__ == "__main__"):
+    main()