Mirror of https://github.com/ultravideo/uvg266.git

Merge branch 'vaq'

Commit 901c25c0c8

@@ -164,6 +164,8 @@ Video structure:
       --high-tier            : Used with --level. Use high tier bitrate limits
                                instead of the main tier limits during encoding.
                                High tier requires level 4 or higher.
+      --vaq <integer>        : Enable variance adaptive quantization with given
+                               strength, in range 1..20.
 
 Compression tools:
       --(no-)deblock <beta:tc> : Deblocking filter. [0:0]

@@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=6
-ver_minor=1
+ver_minor=2
 ver_release=0
 
 # Prevents configure from adding a lot of defines to the CFLAGS

@@ -195,6 +195,10 @@ Same as \-\-level but warnings instead of errors.
 Used with \-\-level. Use high tier bitrate limits
 instead of the main tier limits during encoding.
 High tier requires level 4 or higher.
+.TP
+\fB\-\-vaq <integer>
+Enable variance adaptive quantization with given
+strength, in range 1..20.
 .SS "Compression tools:"
 .TP

src/cfg.c (10 changed lines):
@@ -143,6 +143,8 @@ int kvz_config_init(kvz_config *cfg)
 
   cfg->me_max_steps = (uint32_t)-1;
 
+  cfg->vaq = 0;
+
   cfg->scaling_list = KVZ_SCALING_LIST_OFF;
 
   cfg->max_merge = 5;

@@ -1305,6 +1307,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
   }
   else if (OPT("fast-residual-cost"))
     cfg->fast_residual_cost_limit = atoi(value);
+  else if (OPT("vaq")) {
+    cfg->vaq = (int)atoi(value);
+  }
   else if (OPT("max-merge")) {
     int max_merge = atoi(value);
     if (max_merge < 1 || max_merge > 5) {

@@ -1466,6 +1471,11 @@ int kvz_config_validate(const kvz_config *const cfg)
 {
   int error = 0;
 
+  if (cfg->vaq < 0) {
+    fprintf(stderr, "vaq strength must be positive\n");
+    error = 1;
+  }
+
   if (cfg->width <= 0) {
     fprintf(stderr, "Input error: width must be positive\n");
     error = 1;

@@ -133,6 +133,8 @@ static const struct option long_options[] = {
   { "set-qp-in-cu",     no_argument,       NULL, 0 },
   { "open-gop",         no_argument,       NULL, 0 },
   { "no-open-gop",      no_argument,       NULL, 0 },
+  { "vaq",              required_argument, NULL, 0 },
+  { "no-vaq",           no_argument,       NULL, 0 },
   { "scaling-list",     required_argument, NULL, 0 },
   { "max-merge",        required_argument, NULL, 0 },
   { "early-skip",       no_argument,       NULL, 0 },

@@ -457,6 +459,8 @@ void print_help(void)
       "      --high-tier            : Used with --level. Use high tier bitrate limits\n"
       "                               instead of the main tier limits during encoding.\n"
       "                               High tier requires level 4 or higher.\n"
+      "      --vaq <integer>        : Enable variance adaptive quantization with given\n"
+      "                               strength, in range 1..20.\n"
       "\n"
       /* Word wrap to this width to stay under 80 characters (including ") *************/
       "Compression tools:\n"

@@ -269,6 +269,8 @@ static void encode_transform_coeff(encoder_state_t * const state,
   if (state->must_code_qp_delta) {
     const int qp_pred  = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp);
     const int qp_delta = cur_cu->qp - qp_pred;
+    assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding.");
+    assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25].");
     const int qp_delta_abs = ABS(qp_delta);
     cabac_data_t* cabac    = &state->cabac;

@@ -376,7 +376,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
   // for SMP and AMP partition units.
   encoder->tr_depth_inter = 0;
 
-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) {
+  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
     encoder->max_qp_delta_depth = 0;
   } else {
     encoder->max_qp_delta_depth = -1;

@@ -55,7 +55,7 @@ typedef struct encoder_control_t
     int32_t width_in_lcu;
     int32_t height_in_lcu;
     int32_t real_width;  /*!< \brief real input picture width */
-    int32_t real_height; /*!< \brief real input picture width */
+    int32_t real_height; /*!< \brief real input picture height */
     int64_t pixels_per_pic;
     int8_t source_scan_type;
   } in;

@@ -59,6 +59,7 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
   const encoder_control_t * const encoder = state->encoder_control;
   const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu;
   state->frame->lcu_stats = calloc(num_lcus, sizeof(lcu_stats_t));
+  state->frame->aq_offsets = MALLOC(double, num_lcus);
 
   for (int y = 0; y < encoder->in.height_in_lcu; y++) {
     for (int x = 0; x < encoder->in.width_in_lcu; x++) {

@@ -92,6 +93,7 @@ static void encoder_state_config_frame_finalize(encoder_state_t * const state) {
 
   kvz_image_list_destroy(state->frame->ref);
   FREE_POINTER(state->frame->lcu_stats);
+  FREE_POINTER(state->frame->aq_offsets);
 }
 
 static int encoder_state_config_tile_init(encoder_state_t * const state,

@@ -37,6 +37,8 @@
 #include "tables.h"
 #include "threadqueue.h"
 
+#include "strategies/strategies-picture.h"
+
 
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
   int i;

@@ -1223,6 +1225,21 @@ static void normalize_lcu_weights(encoder_state_t * const state)
   }
 }
 
+// Check if lcu is an edge lcu. Returns false if frame dimensions are divisible by 64
+static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
+{
+  if (xdiv64 && ydiv64) {
+    return false;
+  }
+  int last_row_first_id = (lcus_y - 1) * lcus_x;
+  if ((id % lcus_x == lcus_x - 1 && !xdiv64) || (id >= last_row_first_id && !ydiv64)) {
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+
 static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
   assert(state->type == ENCODER_STATE_TYPE_MAIN);

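A quick sanity check of edge_lcu's indexing, as a standalone sketch (the frame dimensions are hypothetical; the function body is copied from the hunk above):

```c
#include <assert.h>
#include <stdbool.h>

// Copied from the hunk above.
static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
{
  if (xdiv64 && ydiv64) {
    return false;
  }
  int last_row_first_id = (lcus_y - 1) * lcus_x;
  if ((id % lcus_x == lcus_x - 1 && !xdiv64) || (id >= last_row_first_id && !ydiv64)) {
    return true;
  }
  else {
    return false;
  }
}

int main(void)
{
  // Hypothetical 264x130 frame: a 5x3 LCU grid, neither dimension divisible by 64.
  assert(!edge_lcu( 0, 5, 3, false, false)); // top-left: interior
  assert( edge_lcu( 4, 5, 3, false, false)); // rightmost column is partial
  assert( edge_lcu(12, 5, 3, false, false)); // bottom row is partial
  assert(!edge_lcu( 4, 5, 3, true,  true )); // 64-divisible frame: never an edge
  return 0;
}
```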
@@ -1236,6 +1253,92 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
     state->tile->frame->height
   );
 
+  // Variance adaptive quantization
+  if (cfg->vaq) {
+    const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
+    double d = cfg->vaq * 0.1; // Empirically decided constant. Affects delta-QP strength
+
+    // Calculate frame pixel variance
+    uint32_t len = state->tile->frame->width * state->tile->frame->height;
+    uint32_t c_len = len / 4;
+    double frame_var = kvz_pixel_var(state->tile->frame->source->y, len);
+    if (has_chroma) {
+      frame_var += kvz_pixel_var(state->tile->frame->source->u, c_len);
+      frame_var += kvz_pixel_var(state->tile->frame->source->v, c_len);
+    }
+
+    // Loop through LCUs
+    // For each LCU calculate: d * (log(LCU pixel variance) - log(frame pixel variance))
+    unsigned x_lim = state->tile->frame->width_in_lcu;
+    unsigned y_lim = state->tile->frame->height_in_lcu;
+
+    unsigned id = 0;
+    for (int y = 0; y < y_lim; ++y) {
+      for (int x = 0; x < x_lim; ++x) {
+        kvz_pixel tmp[LCU_LUMA_SIZE];
+        int pxl_x = x * LCU_WIDTH;
+        int pxl_y = y * LCU_WIDTH;
+        int x_max = MIN(pxl_x + LCU_WIDTH, frame->width) - pxl_x;
+        int y_max = MIN(pxl_y + LCU_WIDTH, frame->height) - pxl_y;
+
+        bool xdiv64 = false;
+        bool ydiv64 = false;
+        if (frame->width % 64 == 0) xdiv64 = true;
+        if (frame->height % 64 == 0) ydiv64 = true;
+
+        // Luma variance
+        if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) {
+          kvz_pixels_blit(&state->tile->frame->source->y[pxl_x + pxl_y * state->tile->frame->source->stride], tmp,
+                          x_max, y_max, state->tile->frame->source->stride, LCU_WIDTH);
+        } else {
+          // Extend edge pixels for edge lcus
+          for (int y = 0; y < LCU_WIDTH; y++) {
+            for (int x = 0; x < LCU_WIDTH; x++) {
+              int src_y = CLIP(0, frame->height - 1, pxl_y + y);
+              int src_x = CLIP(0, frame->width - 1, pxl_x + x);
+              tmp[y * LCU_WIDTH + x] = state->tile->frame->source->y[src_y * state->tile->frame->source->stride + src_x];
+            }
+          }
+        }
+
+        double lcu_var = kvz_pixel_var(tmp, LCU_LUMA_SIZE);
+
+        if (has_chroma) {
+          // Add chroma variance if not monochrome
+          int32_t c_stride = state->tile->frame->source->stride >> 1;
+          kvz_pixel chromau_tmp[LCU_CHROMA_SIZE];
+          kvz_pixel chromav_tmp[LCU_CHROMA_SIZE];
+          int lcu_chroma_width = LCU_WIDTH >> 1;
+          int c_pxl_x = x * lcu_chroma_width;
+          int c_pxl_y = y * lcu_chroma_width;
+          int c_x_max = MIN(c_pxl_x + lcu_chroma_width, frame->width >> 1) - c_pxl_x;
+          int c_y_max = MIN(c_pxl_y + lcu_chroma_width, frame->height >> 1) - c_pxl_y;
+
+          if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) {
+            kvz_pixels_blit(&state->tile->frame->source->u[c_pxl_x + c_pxl_y * c_stride], chromau_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width);
+            kvz_pixels_blit(&state->tile->frame->source->v[c_pxl_x + c_pxl_y * c_stride], chromav_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width);
+          }
+          else {
+            for (int y = 0; y < lcu_chroma_width; y++) {
+              for (int x = 0; x < lcu_chroma_width; x++) {
+                int src_y = CLIP(0, (frame->height >> 1) - 1, c_pxl_y + y);
+                int src_x = CLIP(0, (frame->width >> 1) - 1, c_pxl_x + x);
+                chromau_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->u[src_y * c_stride + src_x];
+                chromav_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->v[src_y * c_stride + src_x];
+              }
+            }
+          }
+          lcu_var += kvz_pixel_var(chromau_tmp, LCU_CHROMA_SIZE);
+          lcu_var += kvz_pixel_var(chromav_tmp, LCU_CHROMA_SIZE);
+        }
+
+        state->frame->aq_offsets[id] = d * (log(lcu_var) - log(frame_var));
+        id++;
+      }
+    }
+  }
+  // Variance adaptive quantization - END
+
 // Use this flag to handle closed gop irap picture selection.
 // If set to true, irap is already set and we avoid
 // setting it based on the intra period

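The per-LCU offset is d times the log-ratio of LCU variance to frame variance, so detailed LCUs get a positive delta-QP (coarser quantization) and flat ones a negative delta-QP. A worked example with invented numbers:

```c
// Illustrative only: how an LCU's QP offset follows from the formula
// d * (log(lcu_var) - log(frame_var)) in the hunk above. Variances are made up.
#include <math.h>
#include <stdio.h>

int main(void)
{
  const double d = 8 * 0.1;       // --vaq=8
  const double frame_var = 400.0; // hypothetical whole-frame variance
  const double busy_lcu = 2900.0; // detailed region
  const double flat_lcu = 40.0;   // flat region
  printf("busy: %+.2f\n", d * (log(busy_lcu) - log(frame_var))); // ~ +1.58
  printf("flat: %+.2f\n", d * (log(flat_lcu) - log(frame_var))); // ~ -1.84
  return 0;
}
```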
@@ -160,6 +160,11 @@ typedef struct encoder_state_config_frame_t {
 
   struct encoder_state_t const *previous_layer_state;
 
+  /**
+   * \brief Calculated adaptive QP offset for each LCU.
+   */
+  double *aq_offsets;
+
   /**
    * \brief Whether next NAL is the first NAL in the access unit.
    */

@@ -399,6 +399,8 @@ typedef struct kvz_config
   /** \brief Flag to enable/disable open GOP configuration */
   int8_t open_gop;
 
+  int32_t vaq; /** \brief Enable variance adaptive quantization */
+
   /** \brief Type of scaling lists to use */
   int8_t scaling_list;

@@ -637,8 +637,22 @@ static double get_ctu_bits(encoder_state_t * const state, vector2d_t pos) {
   return avg_bits;
 }
 
+static double qp_to_lambda(encoder_state_t* const state, int qp)
+{
+  const int shift_qp = 12;
+  double lambda = 0.57 * pow(2.0, (qp - shift_qp) / 3.0);
+
+  // NOTE: HM adjusts lambda for inter according to Hadamard usage in ME.
+  //       SATD is currently always enabled for ME, so this has no effect.
+  // bool hadamard_me = true;
+  // if (!hadamard_me && state->frame->slicetype != KVZ_SLICE_I) {
+  //   lambda *= 0.95;
+  // }
+
+  return lambda;
+}
+
 void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
   double bits = get_ctu_bits(state, pos);
 
   const encoder_control_t * const encoder = state->encoder_control;

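As a numeric check of the lambda formula (arithmetic only, not part of the commit): lambda = 0.57 * 2^((qp - 12) / 3), so every +3 in QP doubles lambda.

```c
// Quick numeric check of qp_to_lambda (illustration only).
#include <math.h>
#include <stdio.h>

int main(void)
{
  for (int qp = 21; qp <= 27; qp += 3) {
    printf("qp=%d lambda=%.2f\n", qp, 0.57 * pow(2.0, (qp - 12) / 3.0));
  }
  // qp=21 -> 4.56, qp=24 -> 9.12, qp=27 -> 18.24
  return 0;
}
```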
@@ -750,6 +764,26 @@ void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
   ctu->qp     = est_qp;
   ctu->lambda = est_lambda;
   ctu->i_cost = 0;
+
+  // Apply variance adaptive quantization
+  if (encoder->cfg.vaq) {
+    vector2d_t lcu = {
+      pos.x + state->tile->lcu_offset_x,
+      pos.y + state->tile->lcu_offset_y
+    };
+    int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
+    int aq_offset = round(state->frame->aq_offsets[id]);
+    state->qp += aq_offset;
+    // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
+    state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
+    state->qp = CLIP_TO_QP(state->qp);
+    state->lambda = qp_to_lambda(state, state->qp);
+    state->lambda_sqrt = sqrt(state->lambda);
+
+    //ctu->qp = state->qp;
+    //ctu->lambda = state->lambda;
+  }
 }
 
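The clipping keeps the VAQ-adjusted QP within [frame QP - 13, frame QP + 12], half of the spec's [-26, 25] delta-QP window, before clamping to the legal QP range. A hedged illustration (the CLIP macros are redefined here to keep the sketch standalone; hypothetical values):

```c
#include <stdio.h>

// Standalone stand-ins for kvazaar's clipping macros.
#define CLIP(low, high, value) ((value) < (low) ? (low) : ((value) > (high) ? (high) : (value)))
#define CLIP_TO_QP(value) CLIP(0, 51, (value))

int main(void)
{
  const int frame_qp = 22;
  int qp = frame_qp + 16;                      // unusually large positive aq_offset
  qp = CLIP(frame_qp - 13, frame_qp + 12, qp); // -> 34: held inside [QP-13, QP+12]
  qp = CLIP_TO_QP(qp);                         // -> 34: already within [0, 51]
  printf("ctu qp = %d\n", qp);
  return 0;
}
```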
@@ -894,22 +928,6 @@ void kvz_update_after_picture(encoder_state_t * const state) {
   }
 }
 
-
-static double qp_to_lambda(encoder_state_t * const state, int qp)
-{
-  const int shift_qp = 12;
-  double lambda = 0.57 * pow(2.0, (qp - shift_qp) / 3.0);
-
-  // NOTE: HM adjusts lambda for inter according to Hadamard usage in ME.
-  //       SATD is currently always enabled for ME, so this has no effect.
-  // bool hadamard_me = true;
-  // if (!hadamard_me && state->frame->slicetype != KVZ_SLICE_I) {
-  //   lambda *= 0.95;
-  // }
-
-  return lambda;
-}
-
 /**
  * \brief Allocate bits and set lambda and QP for the current picture.
  * \param state the main encoder state

@@ -1049,4 +1067,21 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
     state->lambda      = state->frame->lambda;
     state->lambda_sqrt = sqrt(state->frame->lambda);
   }
+
+  // Apply variance adaptive quantization
+  if (ctrl->cfg.vaq) {
+    vector2d_t lcu = {
+      pos.x + state->tile->lcu_offset_x,
+      pos.y + state->tile->lcu_offset_y
+    };
+    int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
+    int aq_offset = round(state->frame->aq_offsets[id]);
+    state->qp += aq_offset;
+    // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
+    state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
+    state->qp = CLIP_TO_QP(state->qp);
+    state->lambda = qp_to_lambda(state, state->qp);
+    state->lambda_sqrt = sqrt(state->lambda);
+  }
 }

@@ -3,6 +3,30 @@
 
 #include <immintrin.h>
 
+// The calling convention used by MSVC on 32-bit builds will essentially
+// disallow functions to have more than 3 XMM/YMM parameters, because it
+// will not provide more than 8-byte param alignment, and only the first
+// three vector params will be carried in SIMD registers. Now the
+// vectorcall convention could probably be problematic in globally visible
+// functions, but likely not in static ones.
+#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
+  #define FIX_W32 __vectorcall
+#else
+  #define FIX_W32
+#endif
+
+// Non-inline functions defined in this header are likely to trigger a
+// warning for each module including this header that does NOT use them,
+// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW).
+// Tell 'em we actually want to do that, it's not an accident.
+#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__
+  #define FIX_UNUSED __attribute__((unused))
+#else
+  #define FIX_UNUSED
+#endif
+
+#define FIX_NOINLINE FIX_W32 FIX_UNUSED
+
 /*
  * Reorder coefficients from raster to scan order
  * Fun fact: Once upon a time, doing this in a loop looked like this:

@@ -111,4 +135,19 @@ static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t
   *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
 }
 
+static int32_t FIX_NOINLINE hsum_8x32b(const __m256i v)
+{
+  __m256i sum1 = v;
+  __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i sum3 = _mm256_add_epi32       (sum1, sum2);
+  __m256i sum4 = _mm256_shuffle_epi32   (sum3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i sum5 = _mm256_add_epi32       (sum3, sum4);
+  __m256i sum6 = _mm256_shuffle_epi32   (sum5, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256i sum7 = _mm256_add_epi32       (sum5, sum6);
+
+  __m128i sum8 = _mm256_castsi256_si128 (sum7);
+  int32_t sum9 = _mm_cvtsi128_si32      (sum8);
+  return sum9;
+}
+
 #endif

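The shuffle/add ladder in hsum_8x32b is a three-step tree reduction over the eight 32-bit lanes; its scalar equivalent (illustration only, not part of the commit) is:

```c
#include <stdint.h>

// Scalar equivalent of hsum_8x32b. Unsigned arithmetic is used so that
// wraparound matches the packed 32-bit adds.
static int32_t hsum_8x32b_scalar(const int32_t lanes[8])
{
  uint32_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += (uint32_t)lanes[i];
  }
  return (int32_t)sum;
}
```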
@@ -1051,6 +1051,181 @@ static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat
                                pic_stride, ref_stride, left, right);
 }
 
+static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len)
+{
+  const float len_f = (float)len;
+  const __m256i zero = _mm256_setzero_si256();
+
+  size_t i;
+  __m256i sums = zero;
+  for (i = 0; i + 31 < len; i += 32) {
+    __m256i curr     = _mm256_loadu_si256((const __m256i *)(buf + i));
+    __m256i curr_sum = _mm256_sad_epu8(curr, zero);
+    sums             = _mm256_add_epi64(sums, curr_sum);
+  }
+  __m128i sum_lo = _mm256_castsi256_si128  (sums);
+  __m128i sum_hi = _mm256_extracti128_si256(sums, 1);
+  __m128i sum_3  = _mm_add_epi64           (sum_lo, sum_hi);
+  __m128i sum_4  = _mm_shuffle_epi32       (sum_3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sum_5  = _mm_add_epi64           (sum_3, sum_4);
+
+  int64_t sum = _mm_cvtsi128_si64(sum_5);
+
+  // Remaining len mod 32 pixels
+  for (; i < len; ++i) {
+    sum += buf[i];
+  }
+
+  float mean_f = (float)sum / len_f;
+  __m256 mean  = _mm256_set1_ps(mean_f);
+  __m256 accum = _mm256_setzero_ps();
+
+  for (i = 0; i + 31 < len; i += 32) {
+    __m128i curr0 = _mm_loadl_epi64((const __m128i *)(buf + i +  0));
+    __m128i curr1 = _mm_loadl_epi64((const __m128i *)(buf + i +  8));
+    __m128i curr2 = _mm_loadl_epi64((const __m128i *)(buf + i + 16));
+    __m128i curr3 = _mm_loadl_epi64((const __m128i *)(buf + i + 24));
+
+    __m256i curr0_32 = _mm256_cvtepu8_epi32(curr0);
+    __m256i curr1_32 = _mm256_cvtepu8_epi32(curr1);
+    __m256i curr2_32 = _mm256_cvtepu8_epi32(curr2);
+    __m256i curr3_32 = _mm256_cvtepu8_epi32(curr3);
+
+    __m256 curr0_f = _mm256_cvtepi32_ps(curr0_32);
+    __m256 curr1_f = _mm256_cvtepi32_ps(curr1_32);
+    __m256 curr2_f = _mm256_cvtepi32_ps(curr2_32);
+    __m256 curr3_f = _mm256_cvtepi32_ps(curr3_32);
+
+    __m256 curr0_sd = _mm256_sub_ps(curr0_f, mean);
+    __m256 curr1_sd = _mm256_sub_ps(curr1_f, mean);
+    __m256 curr2_sd = _mm256_sub_ps(curr2_f, mean);
+    __m256 curr3_sd = _mm256_sub_ps(curr3_f, mean);
+
+    __m256 curr0_v = _mm256_mul_ps(curr0_sd, curr0_sd);
+    __m256 curr1_v = _mm256_mul_ps(curr1_sd, curr1_sd);
+    __m256 curr2_v = _mm256_mul_ps(curr2_sd, curr2_sd);
+    __m256 curr3_v = _mm256_mul_ps(curr3_sd, curr3_sd);
+
+    __m256 curr01 = _mm256_add_ps(curr0_v, curr1_v);
+    __m256 curr23 = _mm256_add_ps(curr2_v, curr3_v);
+    __m256 curr   = _mm256_add_ps(curr01, curr23);
+    accum         = _mm256_add_ps(accum, curr);
+  }
+  __m256d accum_d  = _mm256_castps_pd     (accum);
+  __m256d accum2_d = _mm256_permute4x64_pd(accum_d, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256  accum2   = _mm256_castpd_ps     (accum2_d);
+
+  __m256 accum3 = _mm256_add_ps    (accum, accum2);
+  __m256 accum4 = _mm256_permute_ps(accum3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256 accum5 = _mm256_add_ps    (accum3, accum4);
+  __m256 accum6 = _mm256_permute_ps(accum5, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256 accum7 = _mm256_add_ps    (accum5, accum6);
+
+  float var_sum = _mm256_cvtss_f32 (accum7);
+
+  // Remaining len mod 32 pixels
+  for (; i < len; ++i) {
+    float diff = buf[i] - mean_f;
+    var_sum += diff * diff;
+  }
+
+  return var_sum / len_f;
+}
+
+#ifdef INACCURATE_VARIANCE_CALCULATION
+
+// Assumes that u is a power of two
+static INLINE uint32_t ilog2(uint32_t u)
+{
+  return _tzcnt_u32(u);
+}
+
+// A B C D | E F G H (8x32b)
+// ==>
+// A+B C+D | E+F G+H (4x64b)
+static __m256i hsum_epi32_to_epi64(const __m256i v)
+{
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i v_shufd = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
+  __m256i sums_32 = _mm256_add_epi32   (v, v_shufd);
+  __m256i sums_64 = _mm256_blend_epi32 (sums_32, zero, 0xaa);
+  return sums_64;
+}
+
+static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
+{
+  assert(sizeof(*buf) == 1);
+  assert((len & 31) == 0);
+
+  // Uses Q8.7 numbers to measure mean and deviation, so variances are Q16.14
+  const uint64_t sum_maxwid   = ilog2(len) + (8 * sizeof(*buf));
+  const __m128i normalize_sum = _mm_cvtsi32_si128(sum_maxwid - 15); // Normalize mean to [0, 32767], so signed 16-bit subtraction never overflows
+  const __m128i debias_sum    = _mm_cvtsi32_si128(1 << (sum_maxwid - 16));
+  const float varsum_to_f     = 1.0f / (float)(1 << (14 + ilog2(len)));
+
+  const bool power_of_two = (len & (len - 1)) == 0;
+  if (sum_maxwid > 32 || sum_maxwid < 15 || !power_of_two) {
+    return pixel_var_avx2_largebuf(buf, len);
+  }
+
+  const __m256i zero      = _mm256_setzero_si256();
+  const __m256i himask_15 = _mm256_set1_epi16(0x7f00);
+
+  size_t i;
+  __m256i sums = zero;
+  for (i = 0; i < len; i += 32) {
+    __m256i curr     = _mm256_loadu_si256((const __m256i *)(buf + i));
+    __m256i curr_sum = _mm256_sad_epu8(curr, zero);
+    sums             = _mm256_add_epi64(sums, curr_sum);
+  }
+  __m128i sum_lo = _mm256_castsi256_si128  (sums);
+  __m128i sum_hi = _mm256_extracti128_si256(sums, 1);
+  __m128i sum_3  = _mm_add_epi64           (sum_lo, sum_hi);
+  __m128i sum_4  = _mm_shuffle_epi32       (sum_3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sum_5  = _mm_add_epi64           (sum_3, sum_4);
+  __m128i sum_5n = _mm_srl_epi32           (sum_5, normalize_sum);
+          sum_5n = _mm_add_epi32           (sum_5n, debias_sum);
+
+  __m256i sum_n = _mm256_broadcastw_epi16  (sum_5n);
+
+  __m256i accum = zero;
+  for (i = 0; i < len; i += 32) {
+    __m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i));
+
+    __m256i curr0 = _mm256_slli_epi16(curr, 7);
+    __m256i curr1 = _mm256_srli_epi16(curr, 1);
+            curr0 = _mm256_and_si256 (curr0, himask_15);
+            curr1 = _mm256_and_si256 (curr1, himask_15);
+
+    __m256i dev0 = _mm256_sub_epi16(curr0, sum_n);
+    __m256i dev1 = _mm256_sub_epi16(curr1, sum_n);
+
+    __m256i vars0 = _mm256_madd_epi16(dev0, dev0);
+    __m256i vars1 = _mm256_madd_epi16(dev1, dev1);
+
+    __m256i varsum = _mm256_add_epi32(vars0, vars1);
+            varsum = hsum_epi32_to_epi64(varsum);
+    accum          = _mm256_add_epi64(accum, varsum);
+  }
+  __m256i accum2 = _mm256_permute4x64_epi64(accum, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i accum3 = _mm256_add_epi64        (accum, accum2);
+  __m256i accum4 = _mm256_permute4x64_epi64(accum3, _MM_SHUFFLE(2, 3, 1, 0));
+  __m256i v_tot  = _mm256_add_epi64        (accum3, accum4);
+  __m128i vt128  = _mm256_castsi256_si128  (v_tot);
+  uint64_t vars  = _mm_cvtsi128_si64       (vt128);
+
+  return (float)vars * varsum_to_f;
+}
+
+#else // INACCURATE_VARIANCE_CALCULATION
+
+static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
+{
+  return pixel_var_avx2_largebuf(buf, len);
+}
+
+#endif // !INACCURATE_VARIANCE_CALCULATION
+
 #endif //COMPILE_INTEL_AVX2
 
 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)

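On the Q8.7 comment in pixel_var_avx2: pixels are shifted into the upper bits of 16-bit lanes so that subtracting the similarly normalized mean can never overflow, and the squared deviations then land in Q16.14. A standalone illustration of that scaling (values invented, not part of the commit):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint8_t  p    = 200;
  uint16_t q8_7 = (uint16_t)p << 7;           // 200.0 in Q8.7 == 25600, max is 255<<7 = 32640
  int32_t  dev  = (int32_t)q8_7 - (130 << 7); // deviation from a hypothetical mean of 130, still Q8.7
  int64_t  var  = (int64_t)dev * dev;         // the multiply doubles the fraction bits: Q16.14
  printf("%f\n", (double)var / (1 << 14));    // back to real units: 70 * 70 = 4900
  return 0;
}
```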
@@ -1089,6 +1264,8 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
     success &= kvz_strategyselector_register(opaque, "hor_sad", "avx2", 40, &hor_sad_avx2);
 
+    success &= kvz_strategyselector_register(opaque, "pixel_var", "avx2", 40, &pixel_var_avx2);
+
   }
 #endif
   return success;

@@ -26,6 +26,7 @@
 
 // Use a couple generic functions from here as a worst-case fallback
 #include "strategies/generic/sao_shared_generics.h"
+#include "strategies/avx2/avx2_common_functions.h"
 #include "strategies/missing-intel-intrinsics.h"
 #include "cu.h"
 #include "encoder.h"

@@ -34,37 +35,10 @@
 #include "sao.h"
 #include "strategyselector.h"
 
-// The calling convention used by MSVC on 32-bit builds will essentially
-// disallow functions to have more than 3 XMM/YMM parameters, because it
-// will not provide more than 8-byte param alignment, and only the first
-// three vector params will be carried in SIMD registers. Now the
-// vectorcall convention could probably be problematic in globally visible
-// functions, but likely not in static ones.
-#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
-  #define FIX_W32 __vectorcall
-#else
-  #define FIX_W32
-#endif
-
 // These optimizations are based heavily on sao-generic.c.
 // Might be useful to check that if (when) this file
 // is difficult to understand.
 
-static int32_t FIX_W32 hsum_8x32b(const __m256i v)
-{
-  __m256i sum1 = v;
-  __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i sum3 = _mm256_add_epi32       (sum1, sum2);
-  __m256i sum4 = _mm256_shuffle_epi32   (sum3, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i sum5 = _mm256_add_epi32       (sum3, sum4);
-  __m256i sum6 = _mm256_shuffle_epi32   (sum5, _MM_SHUFFLE(2, 3, 0, 1));
-  __m256i sum7 = _mm256_add_epi32       (sum5, sum6);
-
-  __m128i sum8 = _mm256_castsi256_si128 (sum7);
-  int32_t sum9 = _mm_cvtsi128_si32      (sum8);
-  return sum9;
-}
-
 // Do the SIGN3 operation for the difference a-b
 static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b)
 {

@@ -675,6 +675,32 @@ static uint32_t hor_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_
   return result;
 }
 
+// Calculate pixel value variance. Takes in arrays of kvz_pixel
+static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len)
+{
+  double var = 0;
+  double arr_mean = 0;
+
+  // Calculate array mean
+  int i = 0;
+  double sum = 0;
+
+  for (; i < len; ++i) {
+    sum += arr[i];
+  }
+  arr_mean = sum / (double)len;
+
+  // Calculate array variance
+  for (i = 0; i < len; ++i) {
+    double tmp = (double)arr[i] - arr_mean;
+    var += tmp * tmp;
+  }
+
+  var /= len;
+
+  return var;
+}
+
 int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
 {
   bool success = true;

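A quick check of the generic variance (population variance, dividing by len): for four pixels {0, 0, 255, 255} the mean is 127.5 and every deviation is ±127.5, so the result should be 127.5² = 16256.25. The sketch below lightly condenses the function from the hunk above and assumes kvz_pixel is uint8_t (true for 8-bit builds):

```c
#include <stdint.h>
#include <stdio.h>

typedef uint8_t kvz_pixel; // assumption: 8-bit pixel type

// Condensed from the hunk above.
static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len)
{
  double sum = 0;
  for (uint32_t i = 0; i < len; ++i) {
    sum += arr[i];
  }
  const double arr_mean = sum / (double)len;

  double var = 0;
  for (uint32_t i = 0; i < len; ++i) {
    double tmp = (double)arr[i] - arr_mean;
    var += tmp * tmp;
  }
  return var / len;
}

int main(void)
{
  const kvz_pixel px[4] = { 0, 0, 255, 255 };
  printf("%f\n", pixel_var_generic(px, 4)); // prints 16256.250000
  return 0;
}
```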
@@ -714,5 +740,7 @@ int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
   success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
   success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic);
 
+  success &= kvz_strategyselector_register(opaque, "pixel_var", "generic", 0, &pixel_var_generic);
+
   return success;
 }

@@ -67,6 +67,8 @@ get_optimized_sad_func *kvz_get_optimized_sad = 0;
 ver_sad_func *kvz_ver_sad = 0;
 hor_sad_func *kvz_hor_sad = 0;
 
+pixel_var_func *kvz_pixel_var = 0;
+
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
   bool success = true;

@@ -138,6 +138,8 @@ typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
                                        bool predict_luma,
                                        bool predict_chroma);
 
+typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len);
+
 // Declare function pointers.
 extern reg_sad_func * kvz_reg_sad;

@@ -176,6 +178,8 @@ extern get_optimized_sad_func *kvz_get_optimized_sad;
 extern ver_sad_func *kvz_ver_sad;
 extern hor_sad_func *kvz_hor_sad;
 
+extern pixel_var_func *kvz_pixel_var;
+
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
 cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
 cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n);

@@ -211,6 +215,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
   {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
   {"ver_sad", (void**) &kvz_ver_sad}, \
   {"hor_sad", (void**) &kvz_hor_sad}, \
+  {"pixel_var", (void**) &kvz_pixel_var}, \

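These picture-strategy hunks follow kvazaar's usual dispatch pattern: a typedef, an extern function pointer, and an entry in the strategy macro list, after which the selector binds kvz_pixel_var to the highest-priority registered implementation (generic at priority 0, AVX2 at 40). A simplified, self-contained sketch of that pattern (not the real strategyselector API):

```c
#include <stdint.h>
#include <stdio.h>

typedef uint8_t kvz_pixel; // assumption: 8-bit pixel type
typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len);

// Dummy bodies standing in for the real implementations.
static double pixel_var_generic(const kvz_pixel *buf, const uint32_t len) { (void)buf; (void)len; return 1.0; }
static double pixel_var_avx2   (const kvz_pixel *buf, const uint32_t len) { (void)buf; (void)len; return 2.0; }

pixel_var_func *kvz_pixel_var = 0;

int main(void)
{
  int have_avx2 = 0; // in kvazaar this comes from runtime CPU detection
  kvz_pixel_var = have_avx2 ? &pixel_var_avx2 : &pixel_var_generic;

  kvz_pixel buf[32] = { 0 };
  printf("%f\n", kvz_pixel_var(buf, 32)); // callers never know which body runs
  return 0;
}
```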
@@ -10,3 +10,6 @@ common_args='264x130 10 -p0 -r1 --threads=2 --wpp --owf=1 --rd=0'
 valgrind_test $common_args --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3
 valgrind_test $common_args --no-rdoq --no-signhide --subme=0
 valgrind_test $common_args --rdoq --no-deblock --no-sao --subme=0
+valgrind_test $common_args --vaq=8
+valgrind_test $common_args --vaq=8 --bitrate 3500
+valgrind_test $common_args --vaq=8 --rc-algorithm oba --bitrate 3500