From 3efaeede763ac5615607628881d3fea8fac050e6 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 19 Aug 2021 17:04:35 +0300 Subject: [PATCH 01/13] [alf] Define the strategy for alf_derive_classification_blk() --- build/kvazaar_lib/kvazaar_lib.vcxproj | 4 + build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 12 + src/Makefile.am | 4 + src/alf.c | 243 +-------------- src/rdo.h | 1 - src/strategies/generic/alf-generic.c | 281 ++++++++++++++++++ src/strategies/generic/alf-generic.h | 31 ++ src/strategies/strategies-alf.c | 41 +++ src/strategies/strategies-alf.h | 56 ++++ src/strategyselector.c | 5 + src/strategyselector.h | 2 + 11 files changed, 437 insertions(+), 243 deletions(-) create mode 100644 src/strategies/generic/alf-generic.c create mode 100644 src/strategies/generic/alf-generic.h create mode 100644 src/strategies/strategies-alf.c create mode 100644 src/strategies/strategies-alf.h diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj index dd08a6e4..ef459fe1 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj @@ -188,10 +188,12 @@ AdvancedVectorExtensions2 AdvancedVectorExtensions2 + + @@ -263,9 +265,11 @@ + + diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters index bc68cd26..bc207ca8 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters @@ -254,6 +254,12 @@ Reconstruction + + Optimization\strategies\generic + + + Optimization\strategies + @@ -473,6 +479,12 @@ Reconstruction + + Optimization\strategies\generic + + + Optimization\strategies + diff --git a/src/Makefile.am b/src/Makefile.am index 6a13c407..4cdf6a34 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -117,6 +117,8 @@ libkvazaar_la_SOURCES = \ transform.h \ videoframe.c \ videoframe.h \ + strategies/generic/alf-generic.c \ + strategies/generic/alf-generic.h \ strategies/generic/dct-generic.c \ strategies/generic/dct-generic.h \ strategies/generic/intra-generic.c \ @@ -137,6 +139,8 @@ libkvazaar_la_SOURCES = \ strategies/optimized_sad_func_ptr_t.h \ strategies/generic/sao_shared_generics.h \ strategies/strategies-common.h \ + strategies/strategies-alf.c \ + strategies/strategies-alf.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ strategies/strategies-intra.c \ diff --git a/src/alf.c b/src/alf.c index dec4d22e..9de923b3 100644 --- a/src/alf.c +++ b/src/alf.c @@ -10,7 +10,7 @@ #include "cabac.h" #include "rdo.h" -#include "strategies/strategies-sao.h" +#include "strategies/strategies-alf.h" #include "kvz_math.h" #include "reshape.h" @@ -5852,247 +5852,6 @@ static void alf_reconstruct(encoder_state_t * const state, } } -static void alf_derive_classification_blk(encoder_state_t * const state, - const int shift, - const int n_height, - const int n_width, - const int blk_pos_x, - const int blk_pos_y, - const int blk_dst_x, - const int blk_dst_y, - const int vb_ctu_height, - int vb_pos) -{ - videoframe_t* const frame = state->tile->frame; - //int ***g_laplacian = state->tile->frame->alf_info->g_laplacian; - //alf_classifier **g_classifier = state->tile->frame->alf_info->g_classifier; - //CHECK((vb_ctu_height & (vb_ctu_height - 1)) != 0, "vb_ctu_height must be a power of 2"); - - static const int th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; - int laplacian[NUM_DIRECTIONS][CLASSIFICATION_BLK_SIZE + 5][CLASSIFICATION_BLK_SIZE + 5]; - memset(laplacian, 0, sizeof(laplacian)); - alf_classifier **classifier = 
state->tile->frame->alf_info->classifier; - - const int stride = frame->rec->stride; - kvz_pixel *src = state->tile->frame->rec->y; - const int max_activity = 15; - - int fl = 2; - int fl_p1 = fl + 1; - int fl2 = 2 * fl; - - int main_direction, secondary_direction, dir_temp_hv, dir_temp_d; - int pix_y; - - int height = n_height + fl2; - int width = n_width + fl2; - int pos_x = blk_pos_x; - int pos_y = blk_pos_y; - int start_height = pos_y - fl_p1; - - for (int i = 0; i < height; i += 2) - { - int yoffset = (i + 1 + start_height) * stride - fl_p1; - const kvz_pixel *src0 = &src[yoffset - stride]; - const kvz_pixel *src1 = &src[yoffset]; - const kvz_pixel *src2 = &src[yoffset + stride]; - const kvz_pixel *src3 = &src[yoffset + stride * 2]; - - const int y = blk_dst_y - 2 + i; - if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos - 2) - { - src3 = &src[yoffset + stride]; - } - else if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos) - { - src0 = &src[yoffset]; - } - - int *p_y_ver = laplacian[ALF_VER][i]; - int *p_y_hor = laplacian[ALF_HOR][i]; - int *p_y_dig_0 = laplacian[ALF_DIAG0][i]; - int *p_y_dig_1 = laplacian[ALF_DIAG1][i]; - - for (int j = 0; j < width; j += 2) - { - pix_y = j + 1 + pos_x; - const kvz_pixel *p_y = src1 + pix_y; - const kvz_pixel *p_y_down = src0 + pix_y; - const kvz_pixel *p_y_up = src2 + pix_y; - const kvz_pixel *p_y_up2 = src3 + pix_y; - - const int16_t y0 = p_y[0] << 1; - const int16_t y_up1 = p_y_up[1] << 1; - - p_y_ver[j] = abs(y0 - p_y_down[0] - p_y_up[0]) + abs(y_up1 - p_y[1] - p_y_up2[1]); - p_y_hor[j] = abs(y0 - p_y[1] - p_y[-1]) + abs(y_up1 - p_y_up[2] - p_y_up[0]); - p_y_dig_0[j] = abs(y0 - p_y_down[-1] - p_y_up[1]) + abs(y_up1 - p_y[0] - p_y_up2[2]); - p_y_dig_1[j] = abs(y0 - p_y_up[-1] - p_y_down[1]) + abs(y_up1 - p_y_up2[0] - p_y[2]); - - if (j > 4 && (j - 6) % 4 == 0) - { - int j_m_6 = j - 6; - int j_m_4 = j - 4; - int j_m_2 = j - 2; - - p_y_ver[j_m_6] += p_y_ver[j_m_4] + p_y_ver[j_m_2] + p_y_ver[j]; - p_y_hor[j_m_6] += p_y_hor[j_m_4] + p_y_hor[j_m_2] + p_y_hor[j]; - p_y_dig_0[j_m_6] += p_y_dig_0[j_m_4] + p_y_dig_0[j_m_2] + p_y_dig_0[j]; - p_y_dig_1[j_m_6] += p_y_dig_1[j_m_4] + p_y_dig_1[j_m_2] + p_y_dig_1[j]; - } - } - } - - // classification block size - const int cls_size_y = 4; - const int cls_size_x = 4; - - //for (int i = 0; i < blk.height; i += cls_size_y) - for (int i = 0; i < n_height; i += cls_size_y) - { - int* p_y_ver = laplacian[ALF_VER][i]; - int* p_y_ver2 = laplacian[ALF_VER][i + 2]; - int* p_y_ver4 = laplacian[ALF_VER][i + 4]; - int* p_y_ver6 = laplacian[ALF_VER][i + 6]; - - int* p_y_hor = laplacian[ALF_HOR][i]; - int* p_y_hor2 = laplacian[ALF_HOR][i + 2]; - int* p_y_hor4 = laplacian[ALF_HOR][i + 4]; - int* p_y_hor6 = laplacian[ALF_HOR][i + 6]; - - int* p_y_dig0 = laplacian[ALF_DIAG0][i]; - int* p_y_dig02 = laplacian[ALF_DIAG0][i + 2]; - int* p_y_dig04 = laplacian[ALF_DIAG0][i + 4]; - int* p_y_dig06 = laplacian[ALF_DIAG0][i + 6]; - - int* p_y_dig1 = laplacian[ALF_DIAG1][i]; - int* p_y_dig12 = laplacian[ALF_DIAG1][i + 2]; - int* p_y_dig14 = laplacian[ALF_DIAG1][i + 4]; - int* p_y_dig16 = laplacian[ALF_DIAG1][i + 6]; - - //for (int j = 0; j < blk.width; j += cls_size_x) - for (int j = 0; j < n_width; j += cls_size_x) - { - int sum_v = 0; int sum_h = 0; int sum_d0 = 0; int sum_d1 = 0; - - if (((i + blk_dst_y) % vb_ctu_height) == (vb_pos - 4)) - { - sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j]; - sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j]; - sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j]; - sum_d1 = p_y_dig1[j] + p_y_dig12[j] + 
p_y_dig14[j]; - } - else if (((i + blk_dst_y) % vb_ctu_height) == vb_pos) - { - sum_v = p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; - sum_h = p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; - sum_d0 = p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; - sum_d1 = p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; - } - else - { - sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; - sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; - sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; - sum_d1 = p_y_dig1[j] + p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; - } - - int temp_act = sum_v + sum_h; - int activity = 0; - - const int y = (i + blk_dst_y) & (vb_ctu_height - 1); - if (y == vb_pos - 4 || y == vb_pos) - { - activity = CLIP(0, max_activity, (temp_act * 96) >> shift); - } - else - { - activity = CLIP(0, max_activity, (temp_act * 64) >> shift); - } - - int class_idx = th[activity]; - - int hv1, hv0, d1, d0, hvd1, hvd0; - - if (sum_v > sum_h) - { - hv1 = sum_v; - hv0 = sum_h; - dir_temp_hv = 1; - } - else - { - hv1 = sum_h; - hv0 = sum_v; - dir_temp_hv = 3; - } - if (sum_d0 > sum_d1) - { - d1 = sum_d0; - d0 = sum_d1; - dir_temp_d = 0; - } - else - { - d1 = sum_d1; - d0 = sum_d0; - dir_temp_d = 2; - } - if ((uint32_t)d1 * (uint32_t)hv0 > (uint32_t)hv1 * (uint32_t)d0) - { - hvd1 = d1; - hvd0 = d0; - main_direction = dir_temp_d; - secondary_direction = dir_temp_hv; - } - else - { - hvd1 = hv1; - hvd0 = hv0; - main_direction = dir_temp_hv; - secondary_direction = dir_temp_d; - } - - int direction_strength = 0; - if (hvd1 > 2 * hvd0) - { - direction_strength = 1; - } - if (hvd1 * 2 > 9 * hvd0) - { - direction_strength = 2; - } - - if (direction_strength) - { - class_idx += (((main_direction & 0x1) << 1) + direction_strength) * 5; - } - - static const int transpose_table[8] = { 0, 1, 0, 2, 2, 3, 1, 3 }; - int transpose_idx = transpose_table[main_direction * 2 + (secondary_direction >> 1)]; - - int y_offset = i + blk_dst_y; - int x_offset = j + blk_dst_x; - - alf_classifier *cl0 = classifier[y_offset] + x_offset; - alf_classifier *cl1 = classifier[y_offset + 1] + x_offset; - alf_classifier *cl2 = classifier[y_offset + 2] + x_offset; - alf_classifier *cl3 = classifier[y_offset + 3] + x_offset; - - cl0[0].class_idx = cl0[1].class_idx = cl0[2].class_idx = cl0[3].class_idx = - cl1[0].class_idx = cl1[1].class_idx = cl1[2].class_idx = cl1[3].class_idx = - cl2[0].class_idx = cl2[1].class_idx = cl2[2].class_idx = cl2[3].class_idx = - cl3[0].class_idx = cl3[1].class_idx = cl3[2].class_idx = cl3[3].class_idx = class_idx; - - cl0[0].transpose_idx = cl0[1].transpose_idx = cl0[2].transpose_idx = cl0[3].transpose_idx = - cl1[0].transpose_idx = cl1[1].transpose_idx = cl1[2].transpose_idx = cl1[3].transpose_idx = - cl2[0].transpose_idx = cl2[1].transpose_idx = cl2[2].transpose_idx = cl2[3].transpose_idx = - cl3[0].transpose_idx = cl3[1].transpose_idx = cl3[2].transpose_idx = cl3[3].transpose_idx = transpose_idx; - - } - } -} - static void alf_derive_classification(encoder_state_t * const state, const int width, const int height, diff --git a/src/rdo.h b/src/rdo.h index 70055fee..91db4d34 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -80,7 +80,6 @@ extern const uint32_t kvz_entropy_bits[512]; // Floating point fractional bits, derived from kvz_entropy_bits extern const float kvz_f_entropy_bits[512]; -// ToDo: generate a new table for VVC? 
#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] #endif diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c new file mode 100644 index 00000000..35d622a7 --- /dev/null +++ b/src/strategies/generic/alf-generic.c @@ -0,0 +1,281 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +#include "strategies/generic/alf-generic.h" + +#include "cu.h" +#include "encoder.h" +#include "encoderstate.h" +#include "kvazaar.h" +#include "alf.h" +#include "strategyselector.h" + + +static void alf_derive_classification_blk_generic(encoder_state_t * const state, + const int shift, + const int n_height, + const int n_width, + const int blk_pos_x, + const int blk_pos_y, + const int blk_dst_x, + const int blk_dst_y, + const int vb_ctu_height, + int vb_pos) +{ + videoframe_t* const frame = state->tile->frame; + //int ***g_laplacian = state->tile->frame->alf_info->g_laplacian; + //alf_classifier **g_classifier = state->tile->frame->alf_info->g_classifier; + //CHECK((vb_ctu_height & (vb_ctu_height - 1)) != 0, "vb_ctu_height must be a power of 2"); + + static const int th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; + int laplacian[NUM_DIRECTIONS][CLASSIFICATION_BLK_SIZE + 5][CLASSIFICATION_BLK_SIZE + 5]; + memset(laplacian, 0, sizeof(laplacian)); + alf_classifier **classifier = state->tile->frame->alf_info->classifier; + + const int stride = frame->rec->stride; + kvz_pixel *src = state->tile->frame->rec->y; + const int max_activity = 15; + + int fl = 2; + int fl_p1 = fl + 1; + int fl2 = 2 * fl; + + int main_direction, secondary_direction, dir_temp_hv, dir_temp_d; + int pix_y; + + int height = n_height + fl2; + int width = n_width + fl2; + int pos_x = blk_pos_x; + int pos_y = blk_pos_y; + int start_height = pos_y - fl_p1; + + for (int i = 0; i < height; i += 2) + { + int yoffset = (i + 1 + start_height) * stride - fl_p1; + const kvz_pixel *src0 = &src[yoffset - stride]; + const kvz_pixel *src1 = &src[yoffset]; + const kvz_pixel *src2 = &src[yoffset + stride]; + const kvz_pixel *src3 = &src[yoffset + stride * 2]; + + const int y = blk_dst_y - 2 + i; + if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos - 2) + { + src3 = &src[yoffset + stride]; + } + else if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos) + { + src0 = &src[yoffset]; + } + + int *p_y_ver = laplacian[ALF_VER][i]; + int *p_y_hor = laplacian[ALF_HOR][i]; + int *p_y_dig_0 = laplacian[ALF_DIAG0][i]; + int *p_y_dig_1 = laplacian[ALF_DIAG1][i]; + + for (int j = 0; j < width; j += 2) + { + pix_y = j + 1 + pos_x; + const kvz_pixel *p_y = src1 + pix_y; + const kvz_pixel *p_y_down = src0 + pix_y; + const 
kvz_pixel *p_y_up = src2 + pix_y; + const kvz_pixel *p_y_up2 = src3 + pix_y; + + const int16_t y0 = p_y[0] << 1; + const int16_t y_up1 = p_y_up[1] << 1; + + p_y_ver[j] = abs(y0 - p_y_down[0] - p_y_up[0]) + abs(y_up1 - p_y[1] - p_y_up2[1]); + p_y_hor[j] = abs(y0 - p_y[1] - p_y[-1]) + abs(y_up1 - p_y_up[2] - p_y_up[0]); + p_y_dig_0[j] = abs(y0 - p_y_down[-1] - p_y_up[1]) + abs(y_up1 - p_y[0] - p_y_up2[2]); + p_y_dig_1[j] = abs(y0 - p_y_up[-1] - p_y_down[1]) + abs(y_up1 - p_y_up2[0] - p_y[2]); + + if (j > 4 && (j - 6) % 4 == 0) + { + int j_m_6 = j - 6; + int j_m_4 = j - 4; + int j_m_2 = j - 2; + + p_y_ver[j_m_6] += p_y_ver[j_m_4] + p_y_ver[j_m_2] + p_y_ver[j]; + p_y_hor[j_m_6] += p_y_hor[j_m_4] + p_y_hor[j_m_2] + p_y_hor[j]; + p_y_dig_0[j_m_6] += p_y_dig_0[j_m_4] + p_y_dig_0[j_m_2] + p_y_dig_0[j]; + p_y_dig_1[j_m_6] += p_y_dig_1[j_m_4] + p_y_dig_1[j_m_2] + p_y_dig_1[j]; + } + } + } + + // classification block size + const int cls_size_y = 4; + const int cls_size_x = 4; + + //for (int i = 0; i < blk.height; i += cls_size_y) + for (int i = 0; i < n_height; i += cls_size_y) + { + int* p_y_ver = laplacian[ALF_VER][i]; + int* p_y_ver2 = laplacian[ALF_VER][i + 2]; + int* p_y_ver4 = laplacian[ALF_VER][i + 4]; + int* p_y_ver6 = laplacian[ALF_VER][i + 6]; + + int* p_y_hor = laplacian[ALF_HOR][i]; + int* p_y_hor2 = laplacian[ALF_HOR][i + 2]; + int* p_y_hor4 = laplacian[ALF_HOR][i + 4]; + int* p_y_hor6 = laplacian[ALF_HOR][i + 6]; + + int* p_y_dig0 = laplacian[ALF_DIAG0][i]; + int* p_y_dig02 = laplacian[ALF_DIAG0][i + 2]; + int* p_y_dig04 = laplacian[ALF_DIAG0][i + 4]; + int* p_y_dig06 = laplacian[ALF_DIAG0][i + 6]; + + int* p_y_dig1 = laplacian[ALF_DIAG1][i]; + int* p_y_dig12 = laplacian[ALF_DIAG1][i + 2]; + int* p_y_dig14 = laplacian[ALF_DIAG1][i + 4]; + int* p_y_dig16 = laplacian[ALF_DIAG1][i + 6]; + + //for (int j = 0; j < blk.width; j += cls_size_x) + for (int j = 0; j < n_width; j += cls_size_x) + { + int sum_v = 0; int sum_h = 0; int sum_d0 = 0; int sum_d1 = 0; + + if (((i + blk_dst_y) % vb_ctu_height) == (vb_pos - 4)) + { + sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j]; + sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j]; + sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j]; + sum_d1 = p_y_dig1[j] + p_y_dig12[j] + p_y_dig14[j]; + } + else if (((i + blk_dst_y) % vb_ctu_height) == vb_pos) + { + sum_v = p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; + sum_h = p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; + sum_d0 = p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; + sum_d1 = p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; + } + else + { + sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; + sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; + sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; + sum_d1 = p_y_dig1[j] + p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; + } + + int temp_act = sum_v + sum_h; + int activity = 0; + + const int y = (i + blk_dst_y) & (vb_ctu_height - 1); + if (y == vb_pos - 4 || y == vb_pos) + { + activity = CLIP(0, max_activity, (temp_act * 96) >> shift); + } + else + { + activity = CLIP(0, max_activity, (temp_act * 64) >> shift); + } + + int class_idx = th[activity]; + + int hv1, hv0, d1, d0, hvd1, hvd0; + + if (sum_v > sum_h) + { + hv1 = sum_v; + hv0 = sum_h; + dir_temp_hv = 1; + } + else + { + hv1 = sum_h; + hv0 = sum_v; + dir_temp_hv = 3; + } + if (sum_d0 > sum_d1) + { + d1 = sum_d0; + d0 = sum_d1; + dir_temp_d = 0; + } + else + { + d1 = sum_d1; + d0 = sum_d0; + dir_temp_d = 2; + } + if ((uint32_t)d1 * (uint32_t)hv0 > (uint32_t)hv1 * (uint32_t)d0) + { + hvd1 = 
d1; + hvd0 = d0; + main_direction = dir_temp_d; + secondary_direction = dir_temp_hv; + } + else + { + hvd1 = hv1; + hvd0 = hv0; + main_direction = dir_temp_hv; + secondary_direction = dir_temp_d; + } + + int direction_strength = 0; + if (hvd1 > 2 * hvd0) + { + direction_strength = 1; + } + if (hvd1 * 2 > 9 * hvd0) + { + direction_strength = 2; + } + + if (direction_strength) + { + class_idx += (((main_direction & 0x1) << 1) + direction_strength) * 5; + } + + static const int transpose_table[8] = { 0, 1, 0, 2, 2, 3, 1, 3 }; + int transpose_idx = transpose_table[main_direction * 2 + (secondary_direction >> 1)]; + + int y_offset = i + blk_dst_y; + int x_offset = j + blk_dst_x; + + alf_classifier *cl0 = classifier[y_offset] + x_offset; + alf_classifier *cl1 = classifier[y_offset + 1] + x_offset; + alf_classifier *cl2 = classifier[y_offset + 2] + x_offset; + alf_classifier *cl3 = classifier[y_offset + 3] + x_offset; + + cl0[0].class_idx = cl0[1].class_idx = cl0[2].class_idx = cl0[3].class_idx = + cl1[0].class_idx = cl1[1].class_idx = cl1[2].class_idx = cl1[3].class_idx = + cl2[0].class_idx = cl2[1].class_idx = cl2[2].class_idx = cl2[3].class_idx = + cl3[0].class_idx = cl3[1].class_idx = cl3[2].class_idx = cl3[3].class_idx = class_idx; + + cl0[0].transpose_idx = cl0[1].transpose_idx = cl0[2].transpose_idx = cl0[3].transpose_idx = + cl1[0].transpose_idx = cl1[1].transpose_idx = cl1[2].transpose_idx = cl1[3].transpose_idx = + cl2[0].transpose_idx = cl2[1].transpose_idx = cl2[2].transpose_idx = cl2[3].transpose_idx = + cl3[0].transpose_idx = cl3[1].transpose_idx = cl3[2].transpose_idx = cl3[3].transpose_idx = transpose_idx; + + } + } +} + + + +int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic); + + return success; +} diff --git a/src/strategies/generic/alf-generic.h b/src/strategies/generic/alf-generic.h new file mode 100644 index 00000000..edbb9d94 --- /dev/null +++ b/src/strategies/generic/alf-generic.h @@ -0,0 +1,31 @@ +#pragma once +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Generic C implementations of optimized functions. 
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth);
+
diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c
new file mode 100644
index 00000000..d6da9332
--- /dev/null
+++ b/src/strategies/strategies-alf.c
@@ -0,0 +1,41 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2021 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "strategies/strategies-alf.h"
+//#include "strategies/avx2/alf-avx2.h"
+#include "strategies/generic/alf-generic.h"
+#include "strategyselector.h"
+
+
+// Define function pointers.
+alf_derive_classification_blk_func* alf_derive_classification_blk;
+
+
+int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
+  bool success = true;
+
+  success &= kvz_strategy_register_alf_generic(opaque, bitdepth);
+
+  if (kvz_g_hardware_flags.intel_flags.avx2) {
+    //success &= kvz_strategy_register_alf_avx2(opaque, bitdepth);
+  }
+
+  return success;
+}
\ No newline at end of file
diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h
new file mode 100644
index 00000000..563697c4
--- /dev/null
+++ b/src/strategies/strategies-alf.h
@@ -0,0 +1,56 @@
+#pragma once
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2021 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Interface for alf functions.
+ */
+
+#include "encoder.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "alf.h"
+
+
+// Declare function pointers.
+typedef void (alf_derive_classification_blk_func)(encoder_state_t * const state,
+                                                  const int shift,
+                                                  const int n_height,
+                                                  const int n_width,
+                                                  const int blk_pos_x,
+                                                  const int blk_pos_y,
+                                                  const int blk_dst_x,
+                                                  const int blk_dst_y,
+                                                  const int vb_ctu_height,
+                                                  int vb_pos);
+
+// Declare function pointers.
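+// NOTE: this pointer is filled in at runtime by kvz_strategyselector_init(),
+// which assigns the highest-priority implementation registered under the
+// name "alf_derive_classification_blk" that the host CPU supports (the
+// generic C version registers with priority 0 as the fallback).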
+extern alf_derive_classification_blk_func * alf_derive_classification_blk;
+
+int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
+
+
+#define STRATEGIES_ALF_EXPORTS \
+  {"alf_derive_classification_blk", (void**) &alf_derive_classification_blk}, \
+
+
diff --git a/src/strategyselector.c b/src/strategyselector.c
index fc97635b..80281940 100644
--- a/src/strategyselector.c
+++ b/src/strategyselector.c
@@ -90,6 +90,11 @@ int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth) {
     fprintf(stderr, "kvz_strategy_register_encode failed!\n");
     return 0;
   }
+
+  if (!kvz_strategy_register_alf(&strategies, bitdepth)) {
+    fprintf(stderr, "kvz_strategy_register_alf failed!\n");
+    return 0;
+  }
 
   while(cur_strategy_to_select->fptr) {
     *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);
diff --git a/src/strategyselector.h b/src/strategyselector.h
index 575accc3..c4820153 100644
--- a/src/strategyselector.h
+++ b/src/strategyselector.h
@@ -96,6 +96,7 @@ int kvz_strategyselector_register(void *opaque, const char *type, const char *st
 #include "strategies/strategies-intra.h"
 #include "strategies/strategies-sao.h"
 #include "strategies/strategies-encode.h"
+#include "strategies/strategies-alf.h"
 
 static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_NAL_EXPORTS
@@ -106,6 +107,7 @@ static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_INTRA_EXPORTS
   STRATEGIES_SAO_EXPORTS
   STRATEGIES_ENCODE_EXPORTS
+  STRATEGIES_ALF_EXPORTS
   { NULL, NULL },
 };
 

From b158d05bcad1d5c004294fbdb3eef8efaa8547a2 Mon Sep 17 00:00:00 2001
From: Marko Viitanen
Date: Thu, 19 Aug 2021 17:19:17 +0300
Subject: [PATCH 02/13] [alf] rename strategy function to include prefix

---
 src/alf.c                       | 2 +-
 src/strategies/strategies-alf.c | 2 +-
 src/strategies/strategies-alf.h | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/alf.c b/src/alf.c
index 9de923b3..8b2d876b 100644
--- a/src/alf.c
+++ b/src/alf.c
@@ -5899,7 +5899,7 @@ static void alf_derive_classification(encoder_state_t * const state,
     {
       int n_width = MIN(j + CLASSIFICATION_BLK_SIZE, max_width) - j;
 
-      alf_derive_classification_blk(state, state->encoder_control->cfg.input_bitdepth + 4, n_height, n_width, j, i,
+      kvz_alf_derive_classification_blk(state, state->encoder_control->cfg.input_bitdepth + 4, n_height, n_width, j, i,
         j - x_pos + blk_dst_x, i - y_pos + blk_dst_y, alf_vb_luma_ctu_height,
         alf_vb_luma_pos);
 
diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c
index d6da9332..6bb7645e 100644
--- a/src/strategies/strategies-alf.c
+++ b/src/strategies/strategies-alf.c
@@ -25,7 +25,7 @@
 
 
 // Define function pointers.
-alf_derive_classification_blk_func* alf_derive_classification_blk;
+alf_derive_classification_blk_func* kvz_alf_derive_classification_blk;
 
 
 int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h
index 563697c4..8c9c2924 100644
--- a/src/strategies/strategies-alf.h
+++ b/src/strategies/strategies-alf.h
@@ -45,12 +45,12 @@ typedef void (alf_derive_classification_blk_func)(encoder_state_t * const state,
                                                   int vb_pos);
 
 // Declare function pointers.
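+// Prefixed with kvz_ because the pointer is a globally visible symbol, like
+// kvazaar's other exported names; the registration string
+// "alf_derive_classification_blk" used by the strategyselector is unchanged.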
-extern alf_derive_classification_blk_func * alf_derive_classification_blk; +extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); #define STRATEGIES_ALF_EXPORTS \ - {"alf_derive_classification_blk", (void**) &alf_derive_classification_blk}, \ + {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \ From c3c96d69c2f3293dfeed41c97390325758aa5eb7 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 20 Aug 2021 11:45:02 +0300 Subject: [PATCH 03/13] [alf] Add modified alf_derive_classification_blk_sse41() from VTM 13.0 * Modified to work with bitdepth 8 --- build/kvazaar_lib/kvazaar_lib.vcxproj | 4 +- build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 6 + src/Makefile.am | 8 +- src/strategies/sse41/alf-sse41.c | 356 ++++++++++++++++++ src/strategies/sse41/alf-sse41.h | 32 ++ src/strategies/strategies-alf.c | 6 +- 6 files changed, 405 insertions(+), 7 deletions(-) create mode 100644 src/strategies/sse41/alf-sse41.c create mode 100644 src/strategies/sse41/alf-sse41.h diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj index ef459fe1..2ac2e406 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj @@ -193,6 +193,7 @@ + @@ -269,6 +270,7 @@ + @@ -343,4 +345,4 @@ - + \ No newline at end of file diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters index bc207ca8..87f212e0 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters @@ -260,6 +260,9 @@ Optimization\strategies + + Optimization\strategies\sse41 + @@ -485,6 +488,9 @@ Optimization\strategies + + Optimization\strategies\sse41 + diff --git a/src/Makefile.am b/src/Makefile.am index 4cdf6a34..af098c39 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -117,7 +117,7 @@ libkvazaar_la_SOURCES = \ transform.h \ videoframe.c \ videoframe.h \ - strategies/generic/alf-generic.c \ + strategies/generic/alf-generic.c \ strategies/generic/alf-generic.h \ strategies/generic/dct-generic.c \ strategies/generic/dct-generic.h \ @@ -139,7 +139,7 @@ libkvazaar_la_SOURCES = \ strategies/optimized_sad_func_ptr_t.h \ strategies/generic/sao_shared_generics.h \ strategies/strategies-common.h \ - strategies/strategies-alf.c \ + strategies/strategies-alf.c \ strategies/strategies-alf.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ @@ -207,7 +207,9 @@ libsse2_la_SOURCES = \ libsse41_la_SOURCES = \ strategies/sse41/picture-sse41.c \ strategies/sse41/picture-sse41.h \ - strategies/sse41/reg_sad_pow2_widths-sse41.h + strategies/sse41/reg_sad_pow2_widths-sse41.h \ + strategies/sse41/alf-sse41.c \ + strategies/sse41/alf-sse41.h if HAVE_PPC diff --git a/src/strategies/sse41/alf-sse41.c b/src/strategies/sse41/alf-sse41.c new file mode 100644 index 00000000..2d7ded14 --- /dev/null +++ b/src/strategies/sse41/alf-sse41.c @@ -0,0 +1,356 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. 
+ * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +#include "global.h" + +#if COMPILE_INTEL_SSE41 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 +#include "strategies/sse41/alf-sse41.h" + +#include +#include + +#include "strategyselector.h" + +static void alf_derive_classification_blk_sse41(encoder_state_t * const state, + const int shift, + const int n_height, + const int n_width, + const int blk_pos_x, + const int blk_pos_y, + const int blk_dst_x, + const int blk_dst_y, + const int vb_ctu_height, + int vb_pos) +{ + videoframe_t* const frame = state->tile->frame; + const size_t imgStride = frame->rec->stride; + const kvz_pixel * srcExt = state->tile->frame->rec->y; + + const int imgHExtended = n_height + 4; + const int imgWExtended = n_width + 4; + + const int posX = blk_pos_x; + const int posY = blk_pos_y; + + alf_classifier** classifier = state->tile->frame->alf_info->classifier; + + // 18x40 array + uint16_t colSums[(CLASSIFICATION_BLK_SIZE + 4) >> 1] + [CLASSIFICATION_BLK_SIZE + 8]; + + for (int i = 0; i < imgHExtended; i += 2) + { + const size_t offset = (i + posY - 3) * imgStride + posX - 3; + + const kvz_pixel*imgY0 = &srcExt[offset]; + const kvz_pixel*imgY1 = &srcExt[offset + imgStride]; + const kvz_pixel*imgY2 = &srcExt[offset + imgStride * 2]; + const kvz_pixel*imgY3 = &srcExt[offset + imgStride * 3]; + + // pixel padding for gradient calculation + int pos = blk_dst_y - 2 + i; + int posInCTU = pos & (vb_ctu_height - 1); + if (pos > 0 && posInCTU == vb_pos - 2) + { + imgY3 = imgY2; + } + else if (pos > 0 && posInCTU == vb_pos) + { + imgY0 = imgY1; + } + + __m128i prev = _mm_setzero_si128(); + + for (int j = 0; j < imgWExtended; j += 8) + { + const __m128i x00 = _mm_loadu_si128((const __m128i*) (imgY0 + j)); + const __m128i x01 = _mm_loadu_si128((const __m128i*) (imgY1 + j)); + const __m128i x02 = _mm_loadu_si128((const __m128i*) (imgY2 + j)); + const __m128i x03 = _mm_loadu_si128((const __m128i*) (imgY3 + j)); + + const __m128i x04 = _mm_loadu_si128((const __m128i*) (imgY0 + j + 2)); + const __m128i x05 = _mm_loadu_si128((const __m128i*) (imgY1 + j + 2)); + const __m128i x06 = _mm_loadu_si128((const __m128i*) (imgY2 + j + 2)); + const __m128i x07 = _mm_loadu_si128((const __m128i*) (imgY3 + j + 2)); + + const __m128i x0 = _mm_unpacklo_epi8(x00, _mm_setzero_si128()); + const __m128i x1 = _mm_unpacklo_epi8(x01, _mm_setzero_si128()); + const __m128i x2 = _mm_unpacklo_epi8(x02, _mm_setzero_si128()); + const __m128i x3 = _mm_unpacklo_epi8(x03, _mm_setzero_si128()); + + const __m128i x4 = _mm_unpacklo_epi8(x04, _mm_setzero_si128()); + const __m128i x5 = _mm_unpacklo_epi8(x05, _mm_setzero_si128()); + const __m128i x6 = _mm_unpacklo_epi8(x06, _mm_setzero_si128()); + const __m128i x7 = _mm_unpacklo_epi8(x07, _mm_setzero_si128()); + + const __m128i nw = _mm_blend_epi16(x0, x1, 0xaa); + const __m128i n = _mm_blend_epi16(x0, x5, 0x55); + const __m128i ne = _mm_blend_epi16(x4, x5, 0xaa); + const __m128i w = _mm_blend_epi16(x1, x2, 0xaa); + const __m128i e = _mm_blend_epi16(x5, x6, 0xaa); + const __m128i sw = _mm_blend_epi16(x2, x3, 0xaa); + const __m128i s = _mm_blend_epi16(x2, x7, 0x55); + const 
__m128i se = _mm_blend_epi16(x6, x7, 0xaa); + + __m128i c = _mm_blend_epi16(x1, x6, 0x55); + c = _mm_add_epi16(c, c); + __m128i d = _mm_shuffle_epi8(c, _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13)); + + const __m128i ver = _mm_abs_epi16(_mm_sub_epi16(c, _mm_add_epi16(n, s))); + const __m128i hor = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(w, e))); + const __m128i di0 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(nw, se))); + const __m128i di1 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(ne, sw))); + + const __m128i hv = _mm_hadd_epi16(ver, hor); + const __m128i di = _mm_hadd_epi16(di0, di1); + const __m128i all = _mm_hadd_epi16(hv, di); + + const __m128i t = _mm_blend_epi16(all, prev, 0xaa); + _mm_storeu_si128((__m128i*) & colSums[i >> 1][j], _mm_hadd_epi16(t, all)); + prev = all; + + if (j + 8 < imgWExtended) + { + j += 8; + + const __m128i x0 = _mm_unpackhi_epi8(x00, _mm_setzero_si128()); + const __m128i x1 = _mm_unpackhi_epi8(x01, _mm_setzero_si128()); + const __m128i x2 = _mm_unpackhi_epi8(x02, _mm_setzero_si128()); + const __m128i x3 = _mm_unpackhi_epi8(x03, _mm_setzero_si128()); + + const __m128i x4 = _mm_unpackhi_epi8(x04, _mm_setzero_si128()); + const __m128i x5 = _mm_unpackhi_epi8(x05, _mm_setzero_si128()); + const __m128i x6 = _mm_unpackhi_epi8(x06, _mm_setzero_si128()); + const __m128i x7 = _mm_unpackhi_epi8(x07, _mm_setzero_si128()); + + const __m128i nw = _mm_blend_epi16(x0, x1, 0xaa); + const __m128i n = _mm_blend_epi16(x0, x5, 0x55); + const __m128i ne = _mm_blend_epi16(x4, x5, 0xaa); + const __m128i w = _mm_blend_epi16(x1, x2, 0xaa); + const __m128i e = _mm_blend_epi16(x5, x6, 0xaa); + const __m128i sw = _mm_blend_epi16(x2, x3, 0xaa); + const __m128i s = _mm_blend_epi16(x2, x7, 0x55); + const __m128i se = _mm_blend_epi16(x6, x7, 0xaa); + + __m128i c = _mm_blend_epi16(x1, x6, 0x55); + c = _mm_add_epi16(c, c); + __m128i d = _mm_shuffle_epi8(c, _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13)); + + const __m128i ver = _mm_abs_epi16(_mm_sub_epi16(c, _mm_add_epi16(n, s))); + const __m128i hor = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(w, e))); + const __m128i di0 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(nw, se))); + const __m128i di1 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(ne, sw))); + + const __m128i hv = _mm_hadd_epi16(ver, hor); + const __m128i di = _mm_hadd_epi16(di0, di1); + const __m128i all = _mm_hadd_epi16(hv, di); + + const __m128i t = _mm_blend_epi16(all, prev, 0xaa); + _mm_storeu_si128((__m128i*) & colSums[i >> 1][j], _mm_hadd_epi16(t, all)); + prev = all; + } + } + } + + for (int i = 0; i < (n_height >> 1); i += 4) + { + for (int j = 0; j < n_width; j += 8) + { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + const uint32_t z = (2 * i + blk_pos_y) & (vb_ctu_height - 1); + const uint32_t z2 = (2 * i + 4 + blk_pos_y) & (vb_ctu_height - 1); + + x0 = (z == vb_pos) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 0][j + 4]); + x1 = _mm_loadu_si128((__m128i *) &colSums[i + 1][j + 4]); + x2 = _mm_loadu_si128((__m128i *) &colSums[i + 2][j + 4]); + x3 = (z == vb_pos - 4) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 3][j + 4]); + + x4 = (z2 == vb_pos) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 2][j + 4]); + x5 = _mm_loadu_si128((__m128i *) &colSums[i + 3][j + 4]); + x6 = _mm_loadu_si128((__m128i *) &colSums[i + 4][j + 4]); + x7 = (z2 == vb_pos - 4) ? 
_mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 5][j + 4]); + + __m128i x0l = _mm_cvtepu16_epi32(x0); + __m128i x0h = _mm_unpackhi_epi16(x0, _mm_setzero_si128()); + __m128i x1l = _mm_cvtepu16_epi32(x1); + __m128i x1h = _mm_unpackhi_epi16(x1, _mm_setzero_si128()); + __m128i x2l = _mm_cvtepu16_epi32(x2); + __m128i x2h = _mm_unpackhi_epi16(x2, _mm_setzero_si128()); + __m128i x3l = _mm_cvtepu16_epi32(x3); + __m128i x3h = _mm_unpackhi_epi16(x3, _mm_setzero_si128()); + __m128i x4l = _mm_cvtepu16_epi32(x4); + __m128i x4h = _mm_unpackhi_epi16(x4, _mm_setzero_si128()); + __m128i x5l = _mm_cvtepu16_epi32(x5); + __m128i x5h = _mm_unpackhi_epi16(x5, _mm_setzero_si128()); + __m128i x6l = _mm_cvtepu16_epi32(x6); + __m128i x6h = _mm_unpackhi_epi16(x6, _mm_setzero_si128()); + __m128i x7l = _mm_cvtepu16_epi32(x7); + __m128i x7h = _mm_unpackhi_epi16(x7, _mm_setzero_si128()); + + x0l = _mm_add_epi32(x0l, x1l); + x2l = _mm_add_epi32(x2l, x3l); + x4l = _mm_add_epi32(x4l, x5l); + x6l = _mm_add_epi32(x6l, x7l); + x0h = _mm_add_epi32(x0h, x1h); + x2h = _mm_add_epi32(x2h, x3h); + x4h = _mm_add_epi32(x4h, x5h); + x6h = _mm_add_epi32(x6h, x7h); + + x0l = _mm_add_epi32(x0l, x2l); + x4l = _mm_add_epi32(x4l, x6l); + x0h = _mm_add_epi32(x0h, x2h); + x4h = _mm_add_epi32(x4h, x6h); + + x2l = _mm_unpacklo_epi32(x0l, x4l); + x2h = _mm_unpackhi_epi32(x0l, x4l); + x6l = _mm_unpacklo_epi32(x0h, x4h); + x6h = _mm_unpackhi_epi32(x0h, x4h); + + __m128i sumV = _mm_unpacklo_epi32(x2l, x6l); + __m128i sumH = _mm_unpackhi_epi32(x2l, x6l); + __m128i sumD0 = _mm_unpacklo_epi32(x2h, x6h); + __m128i sumD1 = _mm_unpackhi_epi32(x2h, x6h); + + // uint32_t tempAct = sumV + sumH; + __m128i tempAct = _mm_add_epi32(sumV, sumH); + + // const uint32_t activity = std::min(15, tempAct * scale >> shift); + // static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; + // uint8_t classIdx = th[activity]; + const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64; + const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 
96 : 64; + __m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2))); + activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift)); + activity = _mm_min_epi32(activity, _mm_set1_epi32(15)); + __m128i classIdx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); + + // if (sumV > sumH) + // { + // hv1 = sumV; + // hv0 = sumH; + // dirTempHV = 0; + // } + // else + // { + // hv1 = sumH; + // hv0 = sumV; + // dirTempHV = 1; + // } + __m128i dirTempHVMinus1 = _mm_cmpgt_epi32(sumV, sumH); + __m128i hv1 = _mm_max_epi32(sumV, sumH); + __m128i hv0 = _mm_min_epi32(sumV, sumH); + + // if (sumD0 > sumD1) + // { + // d1 = sumD0; + // d0 = sumD1; + // dirTempD = 0; + // } + // else + // { + // d1 = sumD1; + // d0 = sumD0; + // dirTempD = 1; + // } + __m128i dirTempDMinus1 = _mm_cmpgt_epi32(sumD0, sumD1); + __m128i d1 = _mm_max_epi32(sumD0, sumD1); + __m128i d0 = _mm_min_epi32(sumD0, sumD1); + + // int dirIdx; + // if (d1 * hv0 > hv1 * d0) + // { + // hvd1 = d1; + // hvd0 = d0; + // dirIdx = 0; + // } + // else + // { + // hvd1 = hv1; + // hvd0 = hv0; + // dirIdx = 2; + // } + __m128i a = _mm_xor_si128(_mm_mullo_epi32(d1, hv0), _mm_set1_epi32(0x80000000)); + __m128i b = _mm_xor_si128(_mm_mullo_epi32(hv1, d0), _mm_set1_epi32(0x80000000)); + __m128i dirIdx = _mm_cmpgt_epi32(a, b); + __m128i hvd1 = _mm_blendv_epi8(hv1, d1, dirIdx); + __m128i hvd0 = _mm_blendv_epi8(hv0, d0, dirIdx); + + // if (hvd1 * 2 > 9 * hvd0) + // { + // classIdx += (dirIdx + 2) * 5; + // } + // else if (hvd1 > 2 * hvd0) + // { + // classIdx += (dirIdx + 1) * 5; + // } + __m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0)); + __m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3))); + __m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5)); + classIdx = _mm_add_epi32(classIdx, offset); + classIdx = _mm_add_epi32(classIdx, _mm_and_si128(strength2, _mm_set1_epi32(5))); + offset = _mm_andnot_si128(dirIdx, offset); + offset = _mm_add_epi32(offset, offset); + classIdx = _mm_add_epi32(classIdx, offset); + + // uint8_t transposeIdx = 2 * dirTempD + dirTempHV; + __m128i transposeIdx = _mm_set1_epi32(3); + transposeIdx = _mm_add_epi32(transposeIdx, dirTempHVMinus1); + transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); + transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); + + int yOffset = 2 * i + blk_pos_y; + int xOffset = j + blk_pos_x; + + static_assert(sizeof(alf_classifier) == 2, "ALFClassifier type must be 16 bits wide"); + __m128i v; + v = _mm_unpacklo_epi8(classIdx, transposeIdx); + v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); + _mm_storeu_si128((__m128i *) (classifier[yOffset] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 1] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 2] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 3] + xOffset), v); + v = _mm_unpackhi_epi8(classIdx, transposeIdx); + v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 4] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 5] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 6] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 7] + xOffset), v); + } + } +} + +#endif // KVZ_BIT_DEPTH == 8 +#endif //COMPILE_INTEL_SSE41 + + +int 
kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth) { + bool success = true; +#if COMPILE_INTEL_SSE41 +#if KVZ_BIT_DEPTH == 8 + if (bitdepth == 8){ + success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41); + } +#endif // KVZ_BIT_DEPTH == 8 +#endif + return success; +} diff --git a/src/strategies/sse41/alf-sse41.h b/src/strategies/sse41/alf-sse41.h new file mode 100644 index 00000000..d41556e4 --- /dev/null +++ b/src/strategies/sse41/alf-sse41.h @@ -0,0 +1,32 @@ +#pragma once +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Optimizations for SSE4.1. + */ + +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" + +int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth); + diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c index 6bb7645e..537dadbe 100644 --- a/src/strategies/strategies-alf.c +++ b/src/strategies/strategies-alf.c @@ -19,7 +19,7 @@ ****************************************************************************/ #include "strategies/strategies-alf.h" -//#include "strategies/avx2/alf-avx2.h" +#include "strategies/sse41/alf-sse41.h" #include "strategies/generic/alf-generic.h" #include "strategyselector.h" @@ -33,8 +33,8 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { success &= kvz_strategy_register_alf_generic(opaque, bitdepth); - if (kvz_g_hardware_flags.intel_flags.avx2) { - //success &= kvz_strategy_register_alf_avx2(opaque, bitdepth); + if (kvz_g_hardware_flags.intel_flags.sse41) { + success &= kvz_strategy_register_alf_sse41(opaque, bitdepth); } return success; From dc6a29b0d8f1280dc9212da2ccf3b206cc1c631c Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 10:50:00 +0300 Subject: [PATCH 04/13] [alf] Initial generic strategies for 5x5 and 7x7 filtering --- src/alf.c | 414 +----------------------- src/strategies/generic/alf-generic.c | 455 +++++++++++++++++++++++++++ src/strategies/strategies-alf.c | 3 +- src/strategies/strategies-alf.h | 40 ++- 4 files changed, 500 insertions(+), 412 deletions(-) diff --git a/src/alf.c b/src/alf.c index 8b2d876b..d591aaed 100644 --- a/src/alf.c +++ b/src/alf.c @@ -5339,412 +5339,6 @@ static void alf_encoder_ctb(encoder_state_t * const state, } } -static void alf_filter_block(encoder_state_t * const state, - const kvz_pixel *src_pixels, - kvz_pixel *dst_pixels, - const int src_stride, - const int dst_stride, - const short* filter_set, - const int16_t *fClipSet, - clp_rng clp_rng, - alf_component_id component_id, - const int width, - const int height, - int 
x_pos, - int y_pos, - int blk_dst_x, - int blk_dst_y, - int vb_pos, - const int vb_ctu_height) -{ - alf_filter_type const filter_type = component_id == COMPONENT_Y ? ALF_FILTER_7X7 : ALF_FILTER_5X5; - const bool chroma = component_id == COMPONENT_Y ? 0 : 1; - const int8_t bit_depth = state->encoder_control->bitdepth; - - if (chroma) - { - assert((int)filter_type == 0); //Chroma needs to have filtType == 0 - } - - const int start_height = y_pos; - const int end_height = start_height + height; - const int start_width = x_pos; - const int end_width = start_width + width; - - const kvz_pixel *src = src_pixels; - kvz_pixel *dst = dst_pixels + blk_dst_y * dst_stride; - - const kvz_pixel *p_img_y_pad_0, *p_img_y_pad_1, *p_img_y_pad_2, *p_img_y_pad_3, *p_img_y_pad_4, *p_img_y_pad_5, *p_img_y_pad_6; - const kvz_pixel *p_img_0, *p_img_1, *p_img_2, *p_img_3, *p_img_4, *p_img_5, *p_img_6; - - const short *coef = filter_set; - const int16_t *clip = fClipSet; - - const int shift = bit_depth - 1; - - const int offset = 1 << (shift - 1); - - int transpose_idx = 0; - const int cls_size_y = 4; - const int cls_size_x = 4; - - assert((start_height % cls_size_y) == 0); //Wrong startHeight in filtering - assert((start_width % cls_size_x) == 0); //Wrong startWidth in filtering - assert(((end_height - start_height) % cls_size_y) == 0); //Wrong endHeight in filtering - assert(((end_width - start_width) % cls_size_x) == 0); //Wrong endWidth in filtering - - alf_classifier *p_class = NULL; - - int dst_stride2 = dst_stride * cls_size_y; - int src_stride2 = src_stride * cls_size_y; - - //std::vector filter_coeff(MAX_NUM_ALF_LUMA_COEFF); - int filter_coeff[MAX_NUM_ALF_LUMA_COEFF]; - memset(filter_coeff, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); - //std::array filterClipp; - int filter_clipp[MAX_NUM_ALF_LUMA_COEFF]; - memset(filter_clipp, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); - - p_img_y_pad_0 = src + start_height * src_stride + start_width; - p_img_y_pad_1 = p_img_y_pad_0 + src_stride; - p_img_y_pad_2 = p_img_y_pad_0 - src_stride; - p_img_y_pad_3 = p_img_y_pad_1 + src_stride; - p_img_y_pad_4 = p_img_y_pad_2 - src_stride; - p_img_y_pad_5 = p_img_y_pad_3 + src_stride; - p_img_y_pad_6 = p_img_y_pad_4 - src_stride; - - kvz_pixel* p_rec_0 = dst + blk_dst_x;//start_width; - kvz_pixel* p_rec_1 = p_rec_0 + dst_stride; - - for (int i = 0; i < end_height - start_height; i += cls_size_y) - { - if (!chroma) - { - p_class = state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x; - } - - for (int j = 0; j < end_width - start_width; j += cls_size_x) - { - if (!chroma) - { - alf_classifier cl = p_class[j]; - transpose_idx = cl.transpose_idx; - coef = filter_set + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; - clip = fClipSet + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; - } - - if (filter_type == ALF_FILTER_7X7) - { - if (transpose_idx == 1) - { - filter_coeff[0] = coef[9]; - filter_coeff[1] = coef[4]; - filter_coeff[2] = coef[10]; - filter_coeff[3] = coef[8]; - filter_coeff[4] = coef[1]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[11]; - filter_coeff[7] = coef[7]; - filter_coeff[8] = coef[3]; - filter_coeff[9] = coef[0]; - filter_coeff[10] = coef[2]; - filter_coeff[11] = coef[6]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[9]; - filter_clipp[1] = clip[4]; - filter_clipp[2] = clip[10]; - filter_clipp[3] = clip[8]; - filter_clipp[4] = clip[1]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[11]; - filter_clipp[7] = clip[7]; - filter_clipp[8] = clip[3]; - filter_clipp[9] = clip[0]; - filter_clipp[10] 
= clip[2]; - filter_clipp[11] = clip[6]; - filter_clipp[12] = clip[12]; - } - else if (transpose_idx == 2) - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[3]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[1]; - filter_coeff[4] = coef[8]; - filter_coeff[5] = coef[7]; - filter_coeff[6] = coef[6]; - filter_coeff[7] = coef[5]; - filter_coeff[8] = coef[4]; - filter_coeff[9] = coef[9]; - filter_coeff[10] = coef[10]; - filter_coeff[11] = coef[11]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[3]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[1]; - filter_clipp[4] = clip[8]; - filter_clipp[5] = clip[7]; - filter_clipp[6] = clip[6]; - filter_clipp[7] = clip[5]; - filter_clipp[8] = clip[4]; - filter_clipp[9] = clip[9]; - filter_clipp[10] = clip[10]; - filter_clipp[11] = clip[11]; - filter_clipp[12] = clip[12]; - - } - else if (transpose_idx == 3) - { - filter_coeff[0] = coef[9]; - filter_coeff[1] = coef[8]; - filter_coeff[2] = coef[10]; - filter_coeff[3] = coef[4]; - filter_coeff[4] = coef[3]; - filter_coeff[5] = coef[7]; - filter_coeff[6] = coef[11]; - filter_coeff[7] = coef[5]; - filter_coeff[8] = coef[1]; - filter_coeff[9] = coef[0]; - filter_coeff[10] = coef[2]; - filter_coeff[11] = coef[6]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[9]; - filter_clipp[1] = clip[8]; - filter_clipp[2] = clip[10]; - filter_clipp[3] = clip[4]; - filter_clipp[4] = clip[3]; - filter_clipp[5] = clip[7]; - filter_clipp[6] = clip[11]; - filter_clipp[7] = clip[5]; - filter_clipp[8] = clip[1]; - filter_clipp[9] = clip[0]; - filter_clipp[10] = clip[2]; - filter_clipp[11] = clip[6]; - filter_clipp[12] = clip[12]; - } - else - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[1]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[3]; - filter_coeff[4] = coef[4]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[6]; - filter_coeff[7] = coef[7]; - filter_coeff[8] = coef[8]; - filter_coeff[9] = coef[9]; - filter_coeff[10] = coef[10]; - filter_coeff[11] = coef[11]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[1]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[3]; - filter_clipp[4] = clip[4]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[6]; - filter_clipp[7] = clip[7]; - filter_clipp[8] = clip[8]; - filter_clipp[9] = clip[9]; - filter_clipp[10] = clip[10]; - filter_clipp[11] = clip[11]; - filter_clipp[12] = clip[12]; - } - } - else - { - if (transpose_idx == 1) - { - filter_coeff[0] = coef[4]; - filter_coeff[1] = coef[1]; - filter_coeff[2] = coef[5]; - filter_coeff[3] = coef[3]; - filter_coeff[4] = coef[0]; - filter_coeff[5] = coef[2]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[4]; - filter_clipp[1] = clip[1]; - filter_clipp[2] = clip[5]; - filter_clipp[3] = clip[3]; - filter_clipp[4] = clip[0]; - filter_clipp[5] = clip[2]; - filter_clipp[6] = clip[6]; - - } - else if (transpose_idx == 2) - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[3]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[1]; - filter_coeff[4] = coef[4]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[3]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[1]; - filter_clipp[4] = clip[4]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[6]; - - } - else if (transpose_idx == 3) - { - filter_coeff[0] = coef[4]; - filter_coeff[1] = coef[3]; - filter_coeff[2] = coef[5]; - filter_coeff[3] = coef[1]; - filter_coeff[4] = 
coef[0]; - filter_coeff[5] = coef[2]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[4]; - filter_clipp[1] = clip[3]; - filter_clipp[2] = clip[5]; - filter_clipp[3] = clip[1]; - filter_clipp[4] = clip[0]; - filter_clipp[5] = clip[2]; - filter_clipp[6] = clip[6]; - - } - else - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[1]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[3]; - filter_coeff[4] = coef[4]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[1]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[3]; - filter_clipp[4] = clip[4]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[6]; - - } - } - - for (int ii = 0; ii < cls_size_y; ii++) - { - p_img_0 = p_img_y_pad_0 + j + ii * src_stride; - p_img_1 = p_img_y_pad_1 + j + ii * src_stride; - p_img_2 = p_img_y_pad_2 + j + ii * src_stride; - p_img_3 = p_img_y_pad_3 + j + ii * src_stride; - p_img_4 = p_img_y_pad_4 + j + ii * src_stride; - p_img_5 = p_img_y_pad_5 + j + ii * src_stride; - p_img_6 = p_img_y_pad_6 + j + ii * src_stride; - - p_rec_1 = p_rec_0 + j + ii * dst_stride; - - const int y_vb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); - - if (y_vb < vb_pos && (y_vb >= vb_pos - (chroma ? 2 : 4))) // above - { - p_img_1 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_1; - p_img_3 = (y_vb >= vb_pos - 2) ? p_img_1 : p_img_3; - p_img_5 = (y_vb >= vb_pos - 3) ? p_img_3 : p_img_5; - - p_img_2 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_2; - p_img_4 = (y_vb >= vb_pos - 2) ? p_img_2 : p_img_4; - p_img_6 = (y_vb >= vb_pos - 3) ? p_img_4 : p_img_6; - } - - else if (y_vb >= vb_pos && (y_vb <= vb_pos + (chroma ? 1 : 3))) // bottom - { - p_img_2 = (y_vb == vb_pos) ? p_img_0 : p_img_2; - p_img_4 = (y_vb <= vb_pos + 1) ? p_img_2 : p_img_4; - p_img_6 = (y_vb <= vb_pos + 2) ? p_img_4 : p_img_6; - - p_img_1 = (y_vb == vb_pos) ? p_img_0 : p_img_1; - p_img_3 = (y_vb <= vb_pos + 1) ? p_img_1 : p_img_3; - p_img_5 = (y_vb <= vb_pos + 2) ? 
p_img_3 : p_img_5; - } - - bool is_near_vb_above = y_vb < vb_pos && (y_vb >= vb_pos - 1); - bool is_near_vb_below = y_vb >= vb_pos && (y_vb <= vb_pos); - for (int jj = 0; jj < cls_size_x; jj++) - { - int sum = 0; - const kvz_pixel curr = p_img_0[+0]; - - if (filter_type == ALF_FILTER_7X7) - { - sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_5[+0], p_img_6[+0])); - - sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_3[+1], p_img_4[-1])); - sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_3[+0], p_img_4[+0])); - sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_3[-1], p_img_4[+1])); - - sum += filter_coeff[4] * (clip_alf(filter_clipp[4], curr, p_img_1[+2], p_img_2[-2])); - sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_1[+1], p_img_2[-1])); - sum += filter_coeff[6] * (clip_alf(filter_clipp[6], curr, p_img_1[+0], p_img_2[+0])); - sum += filter_coeff[7] * (clip_alf(filter_clipp[7], curr, p_img_1[-1], p_img_2[+1])); - sum += filter_coeff[8] * (clip_alf(filter_clipp[8], curr, p_img_1[-2], p_img_2[+2])); - - sum += filter_coeff[9] * (clip_alf(filter_clipp[9], curr, p_img_0[+3], p_img_0[-3])); - sum += filter_coeff[10] * (clip_alf(filter_clipp[10], curr, p_img_0[+2], p_img_0[-2])); - sum += filter_coeff[11] * (clip_alf(filter_clipp[11], curr, p_img_0[+1], p_img_0[-1])); - } - else - { - sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_3[+0], p_img_4[+0])); - - sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_1[+1], p_img_2[-1])); - sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_1[+0], p_img_2[+0])); - sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_1[-1], p_img_2[+1])); - - sum += filter_coeff[4] * (clip_alf(filter_clipp[4], curr, p_img_0[+2], p_img_0[-2])); - sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_0[+1], p_img_0[-1])); - } - - if (!(is_near_vb_above || is_near_vb_below)) - { - sum = (sum + offset) >> shift; - } - else - { - sum = (sum + (1 << ((shift + 3) - 1))) >> (shift + 3); - } - sum += curr; - - p_rec_1[jj] = kvz_fast_clip_32bit_to_pixel(sum); - - p_img_0++; - p_img_1++; - p_img_2++; - p_img_3++; - p_img_4++; - p_img_5++; - p_img_6++; - } - } - } - - p_rec_0 += dst_stride2; - p_rec_1 += dst_stride2; - - p_img_y_pad_0 += src_stride2; - p_img_y_pad_1 += src_stride2; - p_img_y_pad_2 += src_stride2; - p_img_y_pad_3 += src_stride2; - p_img_y_pad_4 += src_stride2; - p_img_y_pad_5 += src_stride2; - p_img_y_pad_6 += src_stride2; - } -} static void alf_reconstruct(encoder_state_t * const state, array_variables *arr_vars) @@ -5819,10 +5413,10 @@ static void alf_reconstruct(encoder_state_t * const state, coeff = arr_vars->fixed_filter_set_coeff_dec[filter_set_index]; clip = arr_vars->clip_default; } - alf_filter_block(state, + kvz_alf_filter_7x7_blk(state, alf_info->alf_tmp_y, state->tile->frame->rec->y, luma_stride, luma_stride, - coeff, clip, arr_vars->clp_rngs.comp[COMPONENT_Y], COMPONENT_Y, + coeff, clip, arr_vars->clp_rngs.comp[COMPONENT_Y], width, height, x_pos, y_pos, x_pos, y_pos, alf_vb_luma_pos, alf_vb_luma_ctu_height); } @@ -5836,10 +5430,10 @@ static void alf_reconstruct(encoder_state_t * const state, const kvz_pixel *src_pixels = comp_id - 1 ? 
alf_info->alf_tmp_v : alf_info->alf_tmp_u; const int alt_num = alf_info->ctu_alternative[comp_id][ctu_idx]; - alf_filter_block(state, + kvz_alf_filter_5x5_blk(state, src_pixels, dst_pixels, chroma_stride, chroma_stride, - arr_vars->chroma_coeff_final[alt_num], arr_vars->chroma_clipp_final[alt_num], arr_vars->clp_rngs.comp[comp_idx], comp_idx, + arr_vars->chroma_coeff_final[alt_num], arr_vars->chroma_clipp_final[alt_num], arr_vars->clp_rngs.comp[comp_idx], width >> chroma_scale_x, height >> chroma_scale_y, x_pos >> chroma_scale_x, y_pos >> chroma_scale_y, x_pos >> chroma_scale_x, y_pos >> chroma_scale_y, diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index 35d622a7..d9aa3b95 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -27,6 +27,12 @@ #include "alf.h" #include "strategyselector.h" +extern kvz_pixel kvz_fast_clip_32bit_to_pixel(int32_t value); + +static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1) +{ + return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref); +} static void alf_derive_classification_blk_generic(encoder_state_t * const state, const int shift, @@ -269,13 +275,462 @@ static void alf_derive_classification_blk_generic(encoder_state_t * const state, } } +static void alf_filter_block_generic(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + alf_component_id component_id, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + alf_filter_type const filter_type = component_id == COMPONENT_Y ? ALF_FILTER_7X7 : ALF_FILTER_5X5; + const bool chroma = component_id == COMPONENT_Y ? 
0 : 1; + const int8_t bit_depth = state->encoder_control->bitdepth; + if (chroma) + { + assert((int)filter_type == 0); //Chroma needs to have filtType == 0 + } + + const int start_height = y_pos; + const int end_height = start_height + height; + const int start_width = x_pos; + const int end_width = start_width + width; + + const kvz_pixel* src = src_pixels; + kvz_pixel* dst = dst_pixels + blk_dst_y * dst_stride; + + const kvz_pixel* p_img_y_pad_0, * p_img_y_pad_1, * p_img_y_pad_2, * p_img_y_pad_3, * p_img_y_pad_4, * p_img_y_pad_5, * p_img_y_pad_6; + const kvz_pixel* p_img_0, * p_img_1, * p_img_2, * p_img_3, * p_img_4, * p_img_5, * p_img_6; + + const short* coef = filter_set; + const int16_t* clip = fClipSet; + + const int shift = bit_depth - 1; + + const int offset = 1 << (shift - 1); + + int transpose_idx = 0; + const int cls_size_y = 4; + const int cls_size_x = 4; + + assert((start_height % cls_size_y) == 0); //Wrong startHeight in filtering + assert((start_width % cls_size_x) == 0); //Wrong startWidth in filtering + assert(((end_height - start_height) % cls_size_y) == 0); //Wrong endHeight in filtering + assert(((end_width - start_width) % cls_size_x) == 0); //Wrong endWidth in filtering + + alf_classifier* p_class = NULL; + + int dst_stride2 = dst_stride * cls_size_y; + int src_stride2 = src_stride * cls_size_y; + + //std::vector filter_coeff(MAX_NUM_ALF_LUMA_COEFF); + int filter_coeff[MAX_NUM_ALF_LUMA_COEFF]; + memset(filter_coeff, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); + //std::array filterClipp; + int filter_clipp[MAX_NUM_ALF_LUMA_COEFF]; + memset(filter_clipp, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); + + p_img_y_pad_0 = src + start_height * src_stride + start_width; + p_img_y_pad_1 = p_img_y_pad_0 + src_stride; + p_img_y_pad_2 = p_img_y_pad_0 - src_stride; + p_img_y_pad_3 = p_img_y_pad_1 + src_stride; + p_img_y_pad_4 = p_img_y_pad_2 - src_stride; + p_img_y_pad_5 = p_img_y_pad_3 + src_stride; + p_img_y_pad_6 = p_img_y_pad_4 - src_stride; + + kvz_pixel* p_rec_0 = dst + blk_dst_x;//start_width; + kvz_pixel* p_rec_1 = p_rec_0 + dst_stride; + + for (int i = 0; i < end_height - start_height; i += cls_size_y) + { + if (!chroma) + { + p_class = state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x; + } + + for (int j = 0; j < end_width - start_width; j += cls_size_x) + { + if (!chroma) + { + alf_classifier cl = p_class[j]; + transpose_idx = cl.transpose_idx; + coef = filter_set + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; + clip = fClipSet + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; + } + + if (filter_type == ALF_FILTER_7X7) + { + if (transpose_idx == 1) + { + filter_coeff[0] = coef[9]; + filter_coeff[1] = coef[4]; + filter_coeff[2] = coef[10]; + filter_coeff[3] = coef[8]; + filter_coeff[4] = coef[1]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[11]; + filter_coeff[7] = coef[7]; + filter_coeff[8] = coef[3]; + filter_coeff[9] = coef[0]; + filter_coeff[10] = coef[2]; + filter_coeff[11] = coef[6]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[9]; + filter_clipp[1] = clip[4]; + filter_clipp[2] = clip[10]; + filter_clipp[3] = clip[8]; + filter_clipp[4] = clip[1]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[11]; + filter_clipp[7] = clip[7]; + filter_clipp[8] = clip[3]; + filter_clipp[9] = clip[0]; + filter_clipp[10] = clip[2]; + filter_clipp[11] = clip[6]; + filter_clipp[12] = clip[12]; + } + else if (transpose_idx == 2) + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[3]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[1]; + 
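+          /* Editor's note: each transpose_idx case in this function applies a
+           * fixed permutation to the diamond-shaped filter taps (13 for the
+           * luma 7x7 filter, 7 for the chroma 5x5 filter). transpose_idx comes
+           * from the block classification; cases 1-3 select transposed and/or
+           * mirrored orientations of the diamond, so a single stored filter
+           * serves all four gradient orientations. Only the tap order changes;
+           * the centre tap always stays in place. */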
filter_coeff[4] = coef[8]; + filter_coeff[5] = coef[7]; + filter_coeff[6] = coef[6]; + filter_coeff[7] = coef[5]; + filter_coeff[8] = coef[4]; + filter_coeff[9] = coef[9]; + filter_coeff[10] = coef[10]; + filter_coeff[11] = coef[11]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[3]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[1]; + filter_clipp[4] = clip[8]; + filter_clipp[5] = clip[7]; + filter_clipp[6] = clip[6]; + filter_clipp[7] = clip[5]; + filter_clipp[8] = clip[4]; + filter_clipp[9] = clip[9]; + filter_clipp[10] = clip[10]; + filter_clipp[11] = clip[11]; + filter_clipp[12] = clip[12]; + + } + else if (transpose_idx == 3) + { + filter_coeff[0] = coef[9]; + filter_coeff[1] = coef[8]; + filter_coeff[2] = coef[10]; + filter_coeff[3] = coef[4]; + filter_coeff[4] = coef[3]; + filter_coeff[5] = coef[7]; + filter_coeff[6] = coef[11]; + filter_coeff[7] = coef[5]; + filter_coeff[8] = coef[1]; + filter_coeff[9] = coef[0]; + filter_coeff[10] = coef[2]; + filter_coeff[11] = coef[6]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[9]; + filter_clipp[1] = clip[8]; + filter_clipp[2] = clip[10]; + filter_clipp[3] = clip[4]; + filter_clipp[4] = clip[3]; + filter_clipp[5] = clip[7]; + filter_clipp[6] = clip[11]; + filter_clipp[7] = clip[5]; + filter_clipp[8] = clip[1]; + filter_clipp[9] = clip[0]; + filter_clipp[10] = clip[2]; + filter_clipp[11] = clip[6]; + filter_clipp[12] = clip[12]; + } + else + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[1]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[3]; + filter_coeff[4] = coef[4]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[6]; + filter_coeff[7] = coef[7]; + filter_coeff[8] = coef[8]; + filter_coeff[9] = coef[9]; + filter_coeff[10] = coef[10]; + filter_coeff[11] = coef[11]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[1]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[3]; + filter_clipp[4] = clip[4]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[6]; + filter_clipp[7] = clip[7]; + filter_clipp[8] = clip[8]; + filter_clipp[9] = clip[9]; + filter_clipp[10] = clip[10]; + filter_clipp[11] = clip[11]; + filter_clipp[12] = clip[12]; + } + } + else + { + if (transpose_idx == 1) + { + filter_coeff[0] = coef[4]; + filter_coeff[1] = coef[1]; + filter_coeff[2] = coef[5]; + filter_coeff[3] = coef[3]; + filter_coeff[4] = coef[0]; + filter_coeff[5] = coef[2]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[4]; + filter_clipp[1] = clip[1]; + filter_clipp[2] = clip[5]; + filter_clipp[3] = clip[3]; + filter_clipp[4] = clip[0]; + filter_clipp[5] = clip[2]; + filter_clipp[6] = clip[6]; + + } + else if (transpose_idx == 2) + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[3]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[1]; + filter_coeff[4] = coef[4]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[3]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[1]; + filter_clipp[4] = clip[4]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[6]; + + } + else if (transpose_idx == 3) + { + filter_coeff[0] = coef[4]; + filter_coeff[1] = coef[3]; + filter_coeff[2] = coef[5]; + filter_coeff[3] = coef[1]; + filter_coeff[4] = coef[0]; + filter_coeff[5] = coef[2]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[4]; + filter_clipp[1] = clip[3]; + filter_clipp[2] = clip[5]; + filter_clipp[3] = clip[1]; + filter_clipp[4] = clip[0]; + 
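+          /* Editor's sketch, not part of the original patch: the eight
+           * if/else branches in this function could be collapsed into one
+           * table-driven copy. The permutations below are transcribed from
+           * the assignments in this function:
+           *
+           *   static const int8_t perm7[4][13] = {
+           *     { 0, 1,  2, 3, 4, 5,  6, 7, 8, 9, 10, 11, 12 }, // transpose_idx 0
+           *     { 9, 4, 10, 8, 1, 5, 11, 7, 3, 0,  2,  6, 12 }, // transpose_idx 1
+           *     { 0, 3,  2, 1, 8, 7,  6, 5, 4, 9, 10, 11, 12 }, // transpose_idx 2
+           *     { 9, 8, 10, 4, 3, 7, 11, 5, 1, 0,  2,  6, 12 }, // transpose_idx 3
+           *   };
+           *   static const int8_t perm5[4][7] = {
+           *     { 0, 1, 2, 3, 4, 5, 6 },
+           *     { 4, 1, 5, 3, 0, 2, 6 },
+           *     { 0, 3, 2, 1, 4, 5, 6 },
+           *     { 4, 3, 5, 1, 0, 2, 6 },
+           *   };
+           *   const int8_t *perm = chroma ? perm5[transpose_idx]
+           *                               : perm7[transpose_idx];
+           *   for (int k = 0; k < (chroma ? 7 : 13); k++) {
+           *     filter_coeff[k] = coef[perm[k]];  // coefficient and clip value
+           *     filter_clipp[k] = clip[perm[k]];  // move with the same permutation
+           *   }
+           */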
filter_clipp[5] = clip[2]; + filter_clipp[6] = clip[6]; + + } + else + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[1]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[3]; + filter_coeff[4] = coef[4]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[1]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[3]; + filter_clipp[4] = clip[4]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[6]; + + } + } + + for (int ii = 0; ii < cls_size_y; ii++) + { + p_img_0 = p_img_y_pad_0 + j + ii * src_stride; + p_img_1 = p_img_y_pad_1 + j + ii * src_stride; + p_img_2 = p_img_y_pad_2 + j + ii * src_stride; + p_img_3 = p_img_y_pad_3 + j + ii * src_stride; + p_img_4 = p_img_y_pad_4 + j + ii * src_stride; + p_img_5 = p_img_y_pad_5 + j + ii * src_stride; + p_img_6 = p_img_y_pad_6 + j + ii * src_stride; + + p_rec_1 = p_rec_0 + j + ii * dst_stride; + + const int y_vb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + + if (y_vb < vb_pos && (y_vb >= vb_pos - (chroma ? 2 : 4))) // above + { + p_img_1 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_1; + p_img_3 = (y_vb >= vb_pos - 2) ? p_img_1 : p_img_3; + p_img_5 = (y_vb >= vb_pos - 3) ? p_img_3 : p_img_5; + + p_img_2 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_2; + p_img_4 = (y_vb >= vb_pos - 2) ? p_img_2 : p_img_4; + p_img_6 = (y_vb >= vb_pos - 3) ? p_img_4 : p_img_6; + } + + else if (y_vb >= vb_pos && (y_vb <= vb_pos + (chroma ? 1 : 3))) // bottom + { + p_img_2 = (y_vb == vb_pos) ? p_img_0 : p_img_2; + p_img_4 = (y_vb <= vb_pos + 1) ? p_img_2 : p_img_4; + p_img_6 = (y_vb <= vb_pos + 2) ? p_img_4 : p_img_6; + + p_img_1 = (y_vb == vb_pos) ? p_img_0 : p_img_1; + p_img_3 = (y_vb <= vb_pos + 1) ? p_img_1 : p_img_3; + p_img_5 = (y_vb <= vb_pos + 2) ? p_img_3 : p_img_5; + } + + bool is_near_vb_above = y_vb < vb_pos && (y_vb >= vb_pos - 1); + bool is_near_vb_below = y_vb >= vb_pos && (y_vb <= vb_pos); + for (int jj = 0; jj < cls_size_x; jj++) + { + int sum = 0; + const kvz_pixel curr = p_img_0[+0]; + + if (filter_type == ALF_FILTER_7X7) + { + sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_5[+0], p_img_6[+0])); + + sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_3[+1], p_img_4[-1])); + sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_3[+0], p_img_4[+0])); + sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_3[-1], p_img_4[+1])); + + sum += filter_coeff[4] * (clip_alf(filter_clipp[4], curr, p_img_1[+2], p_img_2[-2])); + sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_1[+1], p_img_2[-1])); + sum += filter_coeff[6] * (clip_alf(filter_clipp[6], curr, p_img_1[+0], p_img_2[+0])); + sum += filter_coeff[7] * (clip_alf(filter_clipp[7], curr, p_img_1[-1], p_img_2[+1])); + sum += filter_coeff[8] * (clip_alf(filter_clipp[8], curr, p_img_1[-2], p_img_2[+2])); + + sum += filter_coeff[9] * (clip_alf(filter_clipp[9], curr, p_img_0[+3], p_img_0[-3])); + sum += filter_coeff[10] * (clip_alf(filter_clipp[10], curr, p_img_0[+2], p_img_0[-2])); + sum += filter_coeff[11] * (clip_alf(filter_clipp[11], curr, p_img_0[+1], p_img_0[-1])); + } + else + { + sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_3[+0], p_img_4[+0])); + + sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_1[+1], p_img_2[-1])); + sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_1[+0], p_img_2[+0])); + sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_1[-1], p_img_2[+1])); + + sum += filter_coeff[4] * 
(clip_alf(filter_clipp[4], curr, p_img_0[+2], p_img_0[-2])); + sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_0[+1], p_img_0[-1])); + } + + if (!(is_near_vb_above || is_near_vb_below)) + { + sum = (sum + offset) >> shift; + } + else + { + sum = (sum + (1 << ((shift + 3) - 1))) >> (shift + 3); + } + sum += curr; + + p_rec_1[jj] = kvz_fast_clip_32bit_to_pixel(sum); + + p_img_0++; + p_img_1++; + p_img_2++; + p_img_3++; + p_img_4++; + p_img_5++; + p_img_6++; + } + } + } + + p_rec_0 += dst_stride2; + p_rec_1 += dst_stride2; + + p_img_y_pad_0 += src_stride2; + p_img_y_pad_1 += src_stride2; + p_img_y_pad_2 += src_stride2; + p_img_y_pad_3 += src_stride2; + p_img_y_pad_4 += src_stride2; + p_img_y_pad_5 += src_stride2; + p_img_y_pad_6 += src_stride2; + } +} + + +static void alf_filter_5x5_block_generic(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, + filter_set, fClipSet, clp_rng, COMPONENT_Cb, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height); +} + +static void alf_filter_7x7_block_generic(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height); +} int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) { bool success = true; success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic); + success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic); + success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic); return success; } diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c index 537dadbe..d5d99936 100644 --- a/src/strategies/strategies-alf.c +++ b/src/strategies/strategies-alf.c @@ -26,7 +26,8 @@ // Define function pointers. alf_derive_classification_blk_func* kvz_alf_derive_classification_blk; - +alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; +alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { bool success = true; diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h index 8c9c2924..8ad15384 100644 --- a/src/strategies/strategies-alf.h +++ b/src/strategies/strategies-alf.h @@ -22,7 +22,7 @@ /** * \ingroup Optimization * \file - * Interface for sao functions. + * Interface for alf functions. 
*/ #include "encoder.h" @@ -44,13 +44,51 @@ typedef void (alf_derive_classification_blk_func)(encoder_state_t * const state, const int vb_ctu_height, int vb_pos); +typedef void (alf_filter_5x5_blk_func)(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height); + +typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height); + // Declare function pointers. extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk; +extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; +extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); #define STRATEGIES_ALF_EXPORTS \ {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \ + {"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \ + {"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \ From f61b9138cd4d9acf65f2b4f104dd6d6c6dee1635 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 11:36:04 +0300 Subject: [PATCH 05/13] [alf] Import SSE4.1 optimized 5x5 and 7x7 filters from VTM13 * Modified to work with 8-bit pixels --- src/strategies/sse41/alf-sse41.c | 452 +++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 23 deletions(-) diff --git a/src/strategies/sse41/alf-sse41.c b/src/strategies/sse41/alf-sse41.c index 2d7ded14..a803e4cc 100644 --- a/src/strategies/sse41/alf-sse41.c +++ b/src/strategies/sse41/alf-sse41.c @@ -236,13 +236,13 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state, // const uint32_t activity = std::min(15, tempAct * scale >> shift); // static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; - // uint8_t classIdx = th[activity]; + // uint8_t class_idx = th[activity]; const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64; const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 
96 : 64; __m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2))); activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift)); activity = _mm_min_epi32(activity, _mm_set1_epi32(15)); - __m128i classIdx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); + __m128i class_idx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); // if (sumV > sumH) // { @@ -297,48 +297,452 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state, // if (hvd1 * 2 > 9 * hvd0) // { - // classIdx += (dirIdx + 2) * 5; + // class_idx += (dirIdx + 2) * 5; // } // else if (hvd1 > 2 * hvd0) // { - // classIdx += (dirIdx + 1) * 5; + // class_idx += (dirIdx + 1) * 5; // } __m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0)); __m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3))); __m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5)); - classIdx = _mm_add_epi32(classIdx, offset); - classIdx = _mm_add_epi32(classIdx, _mm_and_si128(strength2, _mm_set1_epi32(5))); + class_idx = _mm_add_epi32(class_idx, offset); + class_idx = _mm_add_epi32(class_idx, _mm_and_si128(strength2, _mm_set1_epi32(5))); offset = _mm_andnot_si128(dirIdx, offset); offset = _mm_add_epi32(offset, offset); - classIdx = _mm_add_epi32(classIdx, offset); + class_idx = _mm_add_epi32(class_idx, offset); - // uint8_t transposeIdx = 2 * dirTempD + dirTempHV; - __m128i transposeIdx = _mm_set1_epi32(3); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempHVMinus1); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); + // uint8_t transpose_idx = 2 * dirTempD + dirTempHV; + __m128i transpose_idx = _mm_set1_epi32(3); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempHVMinus1); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1); int yOffset = 2 * i + blk_pos_y; int xOffset = j + blk_pos_x; - static_assert(sizeof(alf_classifier) == 2, "ALFClassifier type must be 16 bits wide"); + static_assert(sizeof(alf_classifier) == 2, "alf_classifier type must be 16 bits wide"); __m128i v; - v = _mm_unpacklo_epi8(classIdx, transposeIdx); + v = _mm_unpacklo_epi8(class_idx, transpose_idx); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); - _mm_storeu_si128((__m128i *) (classifier[yOffset] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 1] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 2] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 3] + xOffset), v); - v = _mm_unpackhi_epi8(classIdx, transposeIdx); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 1] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 2] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 3] + xOffset), v); + v = _mm_unpackhi_epi8(class_idx, transpose_idx); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 4] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 5] + xOffset), v); - 
_mm_storeu_si128((__m128i *) (classifier[yOffset + 6] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 7] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 4] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 5] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 6] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 7] + xOffset), v); } } } + +INLINE static void process2coeffs_5x5(__m128i params[2][3], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) { + const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur); + const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur); + const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur); + const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur); + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m128i val01C = _mm_unpacklo_epi16(val01, val11); + __m128i val01D = _mm_unpackhi_epi16(val01, val11); + + __m128i limit01A = params[1][i]; + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01A); + val01C = _mm_min_epi16(val01C, limit01A); + val01D = _mm_min_epi16(val01D, limit01A); + + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01A); + val01C = _mm_max_epi16(val01C, limit01A); + val01D = _mm_max_epi16(val01D, limit01A); + + val01A = _mm_add_epi16(val01A, val01C); + val01B = _mm_add_epi16(val01B, val01D); + + __m128i coeff01A = params[0][i]; + + *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A)); + *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01A)); +}; + + +static void alf_filter_5x5_block_sse41(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + + + assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2"); + + alf_component_id compId = COMPONENT_Cb; + + const size_t srcStride = src_stride; + const size_t dstStride = dst_stride; + + const int SHIFT = state->encoder_control->bitdepth - 1; + const int ROUND = 1 << (SHIFT - 1); + const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND); + + const size_t STEP_X = 8; + const size_t STEP_Y = 4; + + assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering"); + assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering"); + assert(height % STEP_Y == 0 && "Wrong endHeight in filtering"); + assert(width % 4 == 0 && "Wrong endWidth in filtering"); + + const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos; + kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x; + + + + const __m128i mmOffset = _mm_set1_epi32(ROUND); + const __m128i mmMin = 
_mm_set1_epi16(clp_rng.min); + const __m128i mmMax = _mm_set1_epi16(clp_rng.max); + + __m128i params[2][3]; + __m128i fs = _mm_loadu_si128((__m128i*) filter_set); + params[0][0] = _mm_shuffle_epi32(fs, 0x00); + params[0][1] = _mm_shuffle_epi32(fs, 0x55); + params[0][2] = _mm_shuffle_epi32(fs, 0xaa); + __m128i fc = _mm_loadu_si128((__m128i*) fClipSet); + params[1][0] = _mm_shuffle_epi32(fc, 0x00); + params[1][1] = _mm_shuffle_epi32(fc, 0x55); + params[1][2] = _mm_shuffle_epi32(fc, 0xaa); + + const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); + + for (size_t i = 0; i < height; i += STEP_Y) + { + for (size_t j = 0; j < width; j += STEP_X) + { + + for (size_t ii = 0; ii < STEP_Y; ii++) + { + const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4; + + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + + const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + if (yVb < vb_pos && (yVb >= vb_pos - 2)) // above + { + pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1; + pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3; + + pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2; + pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4; + } + else if (yVb >= vb_pos && (yVb <= vb_pos + 1)) // bottom + { + pImg2 = (yVb == vb_pos) ? pImg0 : pImg2; + pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4; + + pImg1 = (yVb == vb_pos) ? pImg0 : pImg1; + pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3; + } + __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128()); + + __m128i accumA = mmOffset; + __m128i accumB = mmOffset; + + + + process2coeffs_5x5(params, &cur, &accumA, &accumB, 0, pImg3 + 0, pImg4 + 0, pImg1 + 1, pImg2 - 1); + process2coeffs_5x5(params, &cur, &accumA, &accumB, 1, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1); + process2coeffs_5x5(params, &cur, &accumA, &accumB, 2, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1); + bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos); + if (!(isNearVBabove || isNearVBbelow)) + { + accumA = _mm_srai_epi32(accumA, SHIFT); + accumB = _mm_srai_epi32(accumB, SHIFT); + } + else + { + accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3); + accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3); + } + accumA = _mm_packs_epi32(accumA, accumB); + accumA = _mm_add_epi16(accumA, cur); + accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); + + if (j + STEP_X <= width) + { + //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask)); + } + else + { + //_mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_store_ss((float*) (dst + ii * dstStride + j), _mm_castsi128_ps(_mm_shuffle_epi8(accumA, mask))); + } + } + + } + + src += srcStride * STEP_Y; + dst += dstStride * STEP_Y; + } +} + +#define sh(x) 0x0202 * (x & 7) + 0x0100 + 0x1010 * (x & 8) + +static const uint16_t shuffleTab[4][2][8] = { + { + { sh(0), sh(1), sh(2), sh(3), sh(4), sh(5), sh(6), sh(7) }, + { sh(8), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(9), sh(4), sh(10), sh(8), sh(1), sh(5), sh(11), sh(7) }, + { sh(3), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(0), sh(3), sh(2), sh(1), sh(8), sh(7), sh(6), sh(5) }, + { sh(4), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(9), sh(8), 
sh(10), sh(4), sh(3), sh(7), sh(11), sh(5) }, + { sh(1), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) }, + }, +}; + + + +INLINE static void process2coeffs_7x7(__m128i params[2][2][6], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) { + const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur); + const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur); + const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur); + const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur); + + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m128i val01C = _mm_unpacklo_epi16(val01, val11); + __m128i val01D = _mm_unpackhi_epi16(val01, val11); + + __m128i limit01A = params[0][1][i]; + __m128i limit01B = params[1][1][i]; + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01B); + val01C = _mm_min_epi16(val01C, limit01A); + val01D = _mm_min_epi16(val01D, limit01B); + + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); + + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01B); + val01C = _mm_max_epi16(val01C, limit01A); + val01D = _mm_max_epi16(val01D, limit01B); + + val01A = _mm_add_epi16(val01A, val01C); + val01B = _mm_add_epi16(val01B, val01D); + + const __m128i coeff01A = params[0][0][i]; + const __m128i coeff01B = params[1][0][i]; + + *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A)); + *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01B)); +}; + + + + +static void alf_filter_7x7_block_sse41(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2"); + alf_component_id compId = COMPONENT_Y; + + + const size_t srcStride = src_stride; + const size_t dstStride = dst_stride; + + const int SHIFT = state->encoder_control->bitdepth - 1; + const int ROUND = 1 << (SHIFT - 1); + + const size_t STEP_X = 8; + const size_t STEP_Y = 4; + + assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering"); + assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering"); + assert(height % STEP_Y == 0 && "Wrong endHeight in filtering"); + assert(width % STEP_X == 0 && "Wrong endWidth in filtering"); + + const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos; + kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x; + + const __m128i mmOffset = _mm_set1_epi32(ROUND); + const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND); + const __m128i mmMin = _mm_set1_epi16(clp_rng.min); + const __m128i mmMax = _mm_set1_epi16(clp_rng.max); + + const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); + + for (size_t i = 0; i < height; i += STEP_Y) + { + const alf_classifier* pClass = 
state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x; + + for (size_t j = 0; j < width; j += STEP_X) + { + __m128i params[2][2][6]; + + for (int k = 0; k < 2; ++k) + { + const alf_classifier* cl = &pClass[j + 4 * k]; + + const int transpose_idx = cl->transpose_idx; + const int class_idx = cl->class_idx; + + static_assert(sizeof(*filter_set) == 2, "ALF coeffs must be 16-bit wide"); + static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide"); + + __m128i rawCoeff0, rawCoeff1; + __m128i rawClip0, rawClip1; + + rawCoeff0 = _mm_loadu_si128((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF)); + rawCoeff1 = _mm_loadl_epi64((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8)); + + rawClip0 = _mm_loadu_si128((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF)); + rawClip1 = _mm_loadl_epi64((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8)); + + const __m128i s0 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][0]); + const __m128i s1 = _mm_xor_si128(s0, _mm_set1_epi8((char)0x80)); + const __m128i s2 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][1]); + const __m128i s3 = _mm_xor_si128(s2, _mm_set1_epi8((char)0x80)); + + const __m128i rawCoeffLo = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s0), _mm_shuffle_epi8(rawCoeff1, s1)); + const __m128i rawCoeffHi = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s2), _mm_shuffle_epi8(rawCoeff1, s3)); + const __m128i rawClipLo = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s0), _mm_shuffle_epi8(rawClip1, s1)); + const __m128i rawClipHi = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s2), _mm_shuffle_epi8(rawClip1, s3)); + + params[k][0][0] = _mm_shuffle_epi32(rawCoeffLo, 0x00); + params[k][0][1] = _mm_shuffle_epi32(rawCoeffLo, 0x55); + params[k][0][2] = _mm_shuffle_epi32(rawCoeffLo, 0xaa); + params[k][0][3] = _mm_shuffle_epi32(rawCoeffLo, 0xff); + params[k][0][4] = _mm_shuffle_epi32(rawCoeffHi, 0x00); + params[k][0][5] = _mm_shuffle_epi32(rawCoeffHi, 0x55); + params[k][1][0] = _mm_shuffle_epi32(rawClipLo, 0x00); + params[k][1][1] = _mm_shuffle_epi32(rawClipLo, 0x55); + params[k][1][2] = _mm_shuffle_epi32(rawClipLo, 0xaa); + params[k][1][3] = _mm_shuffle_epi32(rawClipLo, 0xff); + params[k][1][4] = _mm_shuffle_epi32(rawClipHi, 0x00); + params[k][1][5] = _mm_shuffle_epi32(rawClipHi, 0x55); + } + + for (size_t ii = 0; ii < STEP_Y; ii++) + { + const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4, * pImg5, * pImg6; + + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + + const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + if (yVb < vb_pos && (yVb >= vb_pos - 4)) // above + { + pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1; + pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3; + pImg5 = (yVb >= vb_pos - 3) ? pImg3 : pImg5; + + pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2; + pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4; + pImg6 = (yVb >= vb_pos - 3) ? pImg4 : pImg6; + } + else if (yVb >= vb_pos && (yVb <= vb_pos + 3)) // bottom + { + pImg2 = (yVb == vb_pos) ? pImg0 : pImg2; + pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4; + pImg6 = (yVb <= vb_pos + 2) ? pImg4 : pImg6; + + pImg1 = (yVb == vb_pos) ? pImg0 : pImg1; + pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3; + pImg5 = (yVb <= vb_pos + 2) ? 
pImg3 : pImg5; + } + __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128()); + + __m128i accumA = mmOffset; + __m128i accumB = mmOffset; + + process2coeffs_7x7(params, &cur, &accumA, &accumB, 0, pImg5 + 0, pImg6 + 0, pImg3 + 1, pImg4 - 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 1, pImg3 + 0, pImg4 + 0, pImg3 - 1, pImg4 + 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 2, pImg1 + 2, pImg2 - 2, pImg1 + 1, pImg2 - 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 3, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 4, pImg1 - 2, pImg2 + 2, pImg0 + 3, pImg0 - 3); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 5, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + + + bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1); + bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos); + if (!(isNearVBabove || isNearVBbelow)) + { + accumA = _mm_srai_epi32(accumA, SHIFT); + accumB = _mm_srai_epi32(accumB, SHIFT); + } + else + { + accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3); + accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3); + } + accumA = _mm_packs_epi32(accumA, accumB); + accumA = _mm_add_epi16(accumA, cur); + accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); + + //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask)); + } + } + + src += srcStride * STEP_Y; + dst += dstStride * STEP_Y; + } +} + + + #endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_SSE41 @@ -349,6 +753,8 @@ int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth) { #if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41); + success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "sse41", 0, &alf_filter_5x5_block_sse41); + success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "sse41", 0, &alf_filter_7x7_block_sse41); } #endif // KVZ_BIT_DEPTH == 8 #endif From 8ef3e6a12651d4696712738a67159dfb149b0bd4 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 20:22:24 +0300 Subject: [PATCH 06/13] [alf] Add strategy for alf_get_blk_stats() and an initial AVX2 version --- build/kvazaar_lib/kvazaar_lib.vcxproj | 7 + build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 6 + src/Makefile.am | 4 +- src/alf.c | 260 +----------- src/strategies/avx2/alf-avx2.c | 393 ++++++++++++++++++ src/strategies/avx2/alf-avx2.h | 32 ++ src/strategies/generic/alf-generic.c | 265 ++++++++++++ src/strategies/strategies-alf.c | 5 + src/strategies/strategies-alf.h | 20 + 9 files changed, 732 insertions(+), 260 deletions(-) create mode 100644 src/strategies/avx2/alf-avx2.c create mode 100644 src/strategies/avx2/alf-avx2.h diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj index 2ac2e406..67ee5ac4 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj @@ -170,6 +170,12 @@ + + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 AdvancedVectorExtensions2 @@ -264,6 +270,7 @@ + diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters index 87f212e0..d547dfb4 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters +++ 
b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters @@ -263,6 +263,9 @@ Optimization\strategies\sse41 + + Optimization\strategies\avx2 + @@ -491,6 +494,9 @@ Optimization\strategies\sse41 + + Optimization\strategies\avx2 + diff --git a/src/Makefile.am b/src/Makefile.am index af098c39..706dc9aa 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -196,7 +196,9 @@ libavx2_la_SOURCES = \ strategies/avx2/quant-avx2.h \ strategies/avx2/reg_sad_pow2_widths-avx2.h \ strategies/avx2/sao-avx2.c \ - strategies/avx2/sao-avx2.h + strategies/avx2/sao-avx2.h \ + strategies/avx2/alf-avx2.c \ + strategies/avx2/alf-avx2.h # strategies/avx2/encode_coding_tree-avx2.c \ # strategies/avx2/encode_coding_tree-avx2.h diff --git a/src/alf.c b/src/alf.c index d591aaed..fa5d7aa3 100644 --- a/src/alf.c +++ b/src/alf.c @@ -4276,264 +4276,6 @@ static void alf_get_avai_aps_ids_luma(encoder_state_t * const state, assert(*new_aps_id < (int)ALF_CTB_MAX_NUM_APS); //Wrong APS index assignment in getAvaiApsIdsLuma } -static void alf_calc_covariance(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], - const kvz_pixel *rec, - const int stride, - const channel_type channel, - const int transpose_idx, - int vb_distance, - short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) -{ - static const int alf_pattern_5[13] = { - 0, - 1, 2, 3, - 4, 5, 6, 5, 4, - 3, 2, 1, - 0 - }; - - static const int alf_pattern_7[25] = { - 0, - 1, 2, 3, - 4, 5, 6, 7, 8, - 9, 10, 11, 12, 11, 10, 9, - 8, 7, 6, 5, 4, - 3, 2, 1, - 0 - }; - - int clip_top_row = -4; - int clip_bot_row = 4; - if (vb_distance >= -3 && vb_distance < 0) - { - clip_bot_row = -vb_distance - 1; - clip_top_row = -clip_bot_row; // symmetric - } - else if (vb_distance >= 0 && vb_distance < 3) - { - clip_top_row = -vb_distance; - clip_bot_row = -clip_top_row; // symmetric - } - - const bool is_luma = channel == CHANNEL_TYPE_LUMA; - const int *filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; - const int half_filter_length = (is_luma ? 
7 : 5) >> 1; - const short* clip = alf_clipping_values[channel]; - const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; - - int k = 0; - - const int16_t curr = rec[0]; - - if (transpose_idx == 0) - { - for (int i = -half_filter_length; i < 0; i++) - { - const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; - const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; - for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); - } - } - } - for (int j = -half_filter_length; j < 0; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); - } - } - } - else if (transpose_idx == 1) - { - for (int j = -half_filter_length; j < 0; j++) - { - const kvz_pixel* rec0 = rec + j; - const kvz_pixel* rec1 = rec - j; - - for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int i = -half_filter_length; i < 0; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); - } - } - } - else if (transpose_idx == 2) - { - for (int i = -half_filter_length; i < 0; i++) - { - const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; - const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; - - for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); - } - } - } - for (int j = -half_filter_length; j < 0; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); - } - } - } - else - { - for (int j = -half_filter_length; j < 0; j++) - { - const kvz_pixel* rec0 = rec + j; - const kvz_pixel* rec1 = rec - j; - - for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int i = -half_filter_length; i < 0; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += curr; - } -} - -static void alf_get_blk_stats(encoder_state_t * const state, - channel_type channel, - alf_covariance *alf_covariance, - alf_classifier **g_classifier, - kvz_pixel *org, - int32_t org_stride, - kvz_pixel *rec, - int32_t rec_stride, - const int x_pos, - const int y_pos, - const int x_dst, - const int y_dst, - const int width, - const int height, - int vb_ctu_height, - int vb_pos, - short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) -{ - int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; - - const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; - - int num_coeff = channel == CHANNEL_TYPE_LUMA ? 
13 : 7; - int transpose_idx = 0; - int class_idx = 0; - - for (int i = 0; i < height; i++) - { - int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; - for (int j = 0; j < width; j++) - { - if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) - { - continue; - } - memset(e_local, 0, sizeof(e_local)); - if (g_classifier) - { - alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; - transpose_idx = cl->transpose_idx; - class_idx = cl->class_idx; - } - - double weight = 1.0; - if (0/*m_alfWSSD*/) - { - //weight = g_luma_level_to_weight_plut[org[j]]; - } - int16_t y_local = org[j] - rec[j]; - alf_calc_covariance(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); - for (int k = 0; k < num_coeff; k++) - { - for (int l = k; l < num_coeff; l++) - { - for (int b0 = 0; b0 < num_bins; b0++) - { - for (int b1 = 0; b1 < num_bins; b1++) - { - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); - } - else - { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; - } - } - } - } - for (int b = 0; b < num_bins; b++) - { - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); - } - else - { - alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; - } - } - } - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local); - } - else - { - alf_covariance[class_idx].pix_acc += y_local * (double)y_local; - } - } - org += org_stride; - rec += rec_stride; - } - - int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1; - for (class_idx = 0; class_idx < num_classes; class_idx++) - { - for (int k = 1; k < num_coeff; k++) - { - for (int l = 0; l < k; l++) - { - for (int b0 = 0; b0 < num_bins; b0++) - { - for (int b1 = 0; b1 < num_bins; b1++) - { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; - } - } - } - } - } -} static void alf_derive_stats_for_filtering(encoder_state_t * const state, short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) @@ -4619,7 +4361,7 @@ static void alf_derive_stats_for_filtering(encoder_state_t * const state, const int num_classes = is_luma ? MAX_NUM_ALF_CLASSES : 1; const int cov_index = ctu_rs_addr * num_classes; - alf_get_blk_stats(state, ch_type, + kvz_alf_get_blk_stats(state, ch_type, &alf_cov[cov_index], comp_idx ? NULL : alf_info->classifier, org, org_stride, rec, rec_stride, pos_x, pos_y, pos_x, pos_y, blk_w, blk_h, diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c new file mode 100644 index 00000000..70e5afe5 --- /dev/null +++ b/src/strategies/avx2/alf-avx2.c @@ -0,0 +1,393 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. 
+ * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +#include "global.h" + +#include "strategies/avx2/alf-avx2.h" + +#if COMPILE_INTEL_AVX2 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 + +#include +#include + +#include "strategyselector.h" + +static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1) +{ + return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref); +} + + +static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], + const kvz_pixel* rec, + const int stride, + const channel_type channel, + const int transpose_idx, + int vb_distance, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + static const int alf_pattern_5[13] = { + 0, + 1, 2, 3, + 4, 5, 6, 5, 4, + 3, 2, 1, + 0 + }; + + static const int alf_pattern_7[25] = { + 0, + 1, 2, 3, + 4, 5, 6, 7, 8, + 9, 10, 11, 12, 11, 10, 9, + 8, 7, 6, 5, 4, + 3, 2, 1, + 0 + }; + + int clip_top_row = -4; + int clip_bot_row = 4; + if (vb_distance >= -3 && vb_distance < 0) + { + clip_bot_row = -vb_distance - 1; + clip_top_row = -clip_bot_row; // symmetric + } + else if (vb_distance >= 0 && vb_distance < 3) + { + clip_top_row = -vb_distance; + clip_bot_row = -clip_top_row; // symmetric + } + + const bool is_luma = channel == CHANNEL_TYPE_LUMA; + const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; + const int half_filter_length = (is_luma ? 
7 : 5) >> 1; + const short* clip = alf_clipping_values[channel]; + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int k = 0; + + const int16_t curr = rec[0]; + + const __m128i negate = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1); + + if (transpose_idx == 0) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + else if (transpose_idx == 1) + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = 
_mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + else if (transpose_idx == 2) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + + for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + else + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, 
val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_set1_epi16(curr)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + +} + +static void alf_get_blk_stats_avx2(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; + + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7; + int transpose_idx = 0; + int class_idx = 0; + + for (int i = 0; i < height; i++) + { + int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; + for (int j = 0; j < width; j++) + { + if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) + { + continue; + } + memset(e_local, 0, sizeof(e_local)); + if (g_classifier) + { + alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; + transpose_idx = cl->transpose_idx; + class_idx = cl->class_idx; + } + + int16_t y_local = org[j] - rec[j]; + + __m256d y_local_d = _mm256_set1_pd((double)y_local); + alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); + for (int k = 0; k < num_coeff; k++) + { + for (int l = k; l < num_coeff; l++) + { + for (int b0 = 0; b0 < 4; b0++) + { + __m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]); + //for (int b1 = 0; b1 < 4; b1++) + { + + //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); + __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); + __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); + + __m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d); + double data[4]; + _mm256_store_pd(data, multiplied); + + //alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; + alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; + alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; + alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; + } + } + } + //for (int b = 0; b < 4; b++) + { + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); + __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); + __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); + + __m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d); + + __m128i output = _mm256_cvtpd_epi32(multiplied); + + int32_t data[4]; + _mm_storeu_si128((__m128i*)data, output); + //alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + 
alf_covariance[class_idx].y[0][k] += data[0]; + alf_covariance[class_idx].y[1][k] += data[1]; + alf_covariance[class_idx].y[2][k] += data[2]; + alf_covariance[class_idx].y[3][k] += data[3]; + } + } + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; + } + org += org_stride; + rec += rec_stride; + } + + int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1; + for (class_idx = 0; class_idx < num_classes; class_idx++) + { + for (int k = 1; k < num_coeff; k++) + { + for (int l = 0; l < k; l++) + { + for (int b0 = 0; b0 < 4; b0++) + { + for (int b1 = 0; b1 < 4; b1++) + { + alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + } + } + } + } + } +} + +#endif // KVZ_BIT_DEPTH == 8 +#endif //COMPILE_INTEL_AVX2 + + +int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) { + bool success = true; +#if COMPILE_INTEL_AVX2 +#if KVZ_BIT_DEPTH == 8 + if (bitdepth == 8){ + success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_avx2); + } +#endif // KVZ_BIT_DEPTH == 8 +#endif + return success; +} diff --git a/src/strategies/avx2/alf-avx2.h b/src/strategies/avx2/alf-avx2.h new file mode 100644 index 00000000..a791b6f8 --- /dev/null +++ b/src/strategies/avx2/alf-avx2.h @@ -0,0 +1,32 @@ +#pragma once +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Optimizations for AVX2.
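 + * Currently this provides the AVX2 path for collecting ALF covariance
 + * statistics (alf_get_blk_stats), registered for 8-bit content only.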
+ */ + +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" + +int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth); + diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index d9aa3b95..e37acbb6 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -724,6 +724,269 @@ static void alf_filter_7x7_block_generic(encoder_state_t* const state, alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height); } + + + +static void alf_calc_covariance_generic(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], + const kvz_pixel* rec, + const int stride, + const channel_type channel, + const int transpose_idx, + int vb_distance, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + static const int alf_pattern_5[13] = { + 0, + 1, 2, 3, + 4, 5, 6, 5, 4, + 3, 2, 1, + 0 + }; + + static const int alf_pattern_7[25] = { + 0, + 1, 2, 3, + 4, 5, 6, 7, 8, + 9, 10, 11, 12, 11, 10, 9, + 8, 7, 6, 5, 4, + 3, 2, 1, + 0 + }; + + int clip_top_row = -4; + int clip_bot_row = 4; + if (vb_distance >= -3 && vb_distance < 0) + { + clip_bot_row = -vb_distance - 1; + clip_top_row = -clip_bot_row; // symmetric + } + else if (vb_distance >= 0 && vb_distance < 3) + { + clip_top_row = -vb_distance; + clip_bot_row = -clip_top_row; // symmetric + } + + const bool is_luma = channel == CHANNEL_TYPE_LUMA; + const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; + const int half_filter_length = (is_luma ? 7 : 5) >> 1; + const short* clip = alf_clipping_values[channel]; + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int k = 0; + + const int16_t curr = rec[0]; + + if (transpose_idx == 0) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); + } + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); + } + } + } + else if (transpose_idx == 1) + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); + } + } + } + else if (transpose_idx == 2) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + + for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], 
rec1[-j]); + } + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); + } + } + } + else + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += curr; + } +} + +static void alf_get_blk_stats_generic(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; + + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7; + int transpose_idx = 0; + int class_idx = 0; + + for (int i = 0; i < height; i++) + { + int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; + for (int j = 0; j < width; j++) + { + if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) + { + continue; + } + memset(e_local, 0, sizeof(e_local)); + if (g_classifier) + { + alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; + transpose_idx = cl->transpose_idx; + class_idx = cl->class_idx; + } + + double weight = 1.0; + if (0/*m_alfWSSD*/) + { + //weight = g_luma_level_to_weight_plut[org[j]]; + } + int16_t y_local = org[j] - rec[j]; + alf_calc_covariance_generic(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); + for (int k = 0; k < num_coeff; k++) + { + for (int l = k; l < num_coeff; l++) + { + for (int b0 = 0; b0 < num_bins; b0++) + { + for (int b1 = 0; b1 < num_bins; b1++) + { + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + } + else + { + alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + } + } + } + } + for (int b = 0; b < num_bins; b++) + { + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); + } + else + { + alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + } + } + } + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local); + } + else + { + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; + } + } + org += org_stride; + rec += rec_stride; + } + + int num_classes = g_classifier ? 
MAX_NUM_ALF_CLASSES : 1; + for (class_idx = 0; class_idx < num_classes; class_idx++) + { + for (int k = 1; k < num_coeff; k++) + { + for (int l = 0; l < k; l++) + { + for (int b0 = 0; b0 < num_bins; b0++) + { + for (int b1 = 0; b1 < num_bins; b1++) + { + alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + } + } + } + } + } +} + + int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -731,6 +994,8 @@ int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic); success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic); success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic); + success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_generic); + return success; } diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c index d5d99936..666e8cb5 100644 --- a/src/strategies/strategies-alf.c +++ b/src/strategies/strategies-alf.c @@ -20,6 +20,7 @@ #include "strategies/strategies-alf.h" #include "strategies/sse41/alf-sse41.h" +#include "strategies/avx2/alf-avx2.h" #include "strategies/generic/alf-generic.h" #include "strategyselector.h" @@ -28,6 +29,7 @@ alf_derive_classification_blk_func* kvz_alf_derive_classification_blk; alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; +alf_get_blk_stats_func* kvz_alf_get_blk_stats; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { bool success = true; @@ -37,6 +39,9 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { if (kvz_g_hardware_flags.intel_flags.sse41) { success &= kvz_strategy_register_alf_sse41(opaque, bitdepth); } + if (kvz_g_hardware_flags.intel_flags.avx2) { + success &= kvz_strategy_register_alf_avx2(opaque, bitdepth); + } return success; } \ No newline at end of file diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h index 8ad15384..32a650f7 100644 --- a/src/strategies/strategies-alf.h +++ b/src/strategies/strategies-alf.h @@ -78,10 +78,29 @@ typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state, int vb_pos, const int vb_ctu_height); +typedef void (alf_get_blk_stats_func)(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]); + // Declare function pointers. 
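// These pointers are filled in by kvz_strategy_register_alf(): each compiled
// implementation registers itself (the SSE4.1/AVX2 ones only when the CPU
// supports them) and the registration with the highest priority is selected,
// so "avx2" at priority 40 beats "generic" at 0. An illustrative call through
// the pointer, with argument names taken from the typedef above:
//   kvz_alf_get_blk_stats(state, channel, alf_covariance, g_classifier,
//                         org, org_stride, rec, rec_stride,
//                         x_pos, y_pos, x_dst, y_dst, width, height,
//                         vb_ctu_height, vb_pos, alf_clipping_values);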
extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk; extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; +extern alf_get_blk_stats_func* kvz_alf_get_blk_stats; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); @@ -90,5 +109,6 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \ {"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \ {"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \ + {"alf_get_blk_stats", (void**) &kvz_alf_get_blk_stats}, \ From 915bf3ca2456372c50887d459c41213ba28bf784 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 20:29:58 +0300 Subject: [PATCH 07/13] [alf] Fix AVX2 priority --- src/strategies/avx2/alf-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 70e5afe5..a2e09483 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -385,7 +385,7 @@ int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) { #if COMPILE_INTEL_AVX2 #if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_avx2); + success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "avx2", 40, &alf_get_blk_stats_avx2); } #endif // KVZ_BIT_DEPTH == 8 #endif From f4de5cfd0f7594970bcd01f467708776ad6f8eae Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 10:20:57 +0300 Subject: [PATCH 08/13] [alf] Cleanup alf_calc_covariance_avx2() and use integers in alf_get_blk_stats_avx2() --- src/strategies/avx2/alf-avx2.c | 196 ++++++++++----------------------- 1 file changed, 58 insertions(+), 138 deletions(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index a2e09483..7fd625b4 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -36,6 +36,19 @@ static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref); } +#define ALF_CLIP_AND_ADD(VAL0,VAL1) __m128i clips = _mm_loadl_epi64((__m128i*) clip); \ +__m128i neg_clips = _mm_sign_epi16(clips, negate); \ +__m128i val0 = _mm_set1_epi16((VAL0 - curr));\ +__m128i val1 = _mm_set1_epi16((VAL1 - curr));\ +__m128i min_clips_val0 = _mm_min_epi16(clips, val0);\ +__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);\ +\ +__m128i min_clips_val1 = _mm_min_epi16(clips, val1);\ +__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);\ +\ +__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);\ +__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));\ +_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], const kvz_pixel* rec, @@ -96,36 +109,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); - __m128i val1 = 
_mm_set1_epi16((rec1[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[j], rec1[-j]); } } for (int j = -half_filter_length; j < 0; j++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[j] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec[j], rec[-j]); } } else if (transpose_idx == 1) @@ -137,36 +126,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); } } for (int i = -half_filter_length; i < 0; i++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); } } else if (transpose_idx == 2) @@ -178,36 +143,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX for 
(int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); - __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[j], rec1[-j]); } } for (int j = -half_filter_length; j < 0; j++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[j] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec[j], rec[-j]); } } else @@ -219,36 +160,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); } } for (int i = -half_filter_length; i < 0; i++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + 
ALF_CLIP_AND_ADD(rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); } } @@ -303,7 +220,9 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, int16_t y_local = org[j] - rec[j]; - __m256d y_local_d = _mm256_set1_pd((double)y_local); + //__m256i const perm_mask = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0); + + __m256i y_local_32 = _mm256_set1_epi32(y_local); alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); for (int k = 0; k < num_coeff; k++) { @@ -311,45 +230,46 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, { for (int b0 = 0; b0 < 4; b0++) { - __m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]); - //for (int b1 = 0; b1 < 4; b1++) + if (!e_local[k][b0]) continue; + __m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]); + /*for (int b1 = 0; b1 < 4; b1++) { + alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + }*/ + + //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); + __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); + __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32); + int64_t data[4]; + _mm256_storeu_si256((__m256i*)data, multiplied); - //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) - __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); - __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); - __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); - __m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d); - double data[4]; - _mm256_store_pd(data, multiplied); - - //alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; - alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; - alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; - alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; - alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; - } + alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; + alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; + alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; + alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; + } } - //for (int b = 0; b < 4; b++) + /* + for (int b = 0; b < 4; b++) { - __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); - __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); - __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); - - __m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d); - - __m128i output = _mm256_cvtpd_epi32(multiplied); - - int32_t data[4]; - _mm_storeu_si128((__m128i*)data, output); - //alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; - alf_covariance[class_idx].y[0][k] += data[0]; - alf_covariance[class_idx].y[1][k] += data[1]; - alf_covariance[class_idx].y[2][k] += data[2]; - alf_covariance[class_idx].y[3][k] += data[3]; - } + alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + }*/ + + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); + __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); + __m256i multiplied = _mm256_mul_epi32(y_local_32, e_local_32); + //__m256i output = _mm256_permutevar8x32_epi32(multiplied, perm_mask); + + int64_t data[4]; + _mm256_storeu_si256((__m256i*)data, multiplied); + + alf_covariance[class_idx].y[0][k] += data[0]; + alf_covariance[class_idx].y[1][k] += data[1]; + alf_covariance[class_idx].y[2][k] += data[2]; + alf_covariance[class_idx].y[3][k] += data[3]; } 
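/* A scalar sketch of the vector update just above, matching the commented-out
   reference code: for each clipping bin b,
     alf_covariance[class_idx].y[b][k] += e_local[k][b] * y_local;
   _mm256_cvtepi16_epi64 widens the four int16 e_local values so that
   _mm256_mul_epi32 can multiply the low 32 bits of each 64-bit lane by
   y_local, producing all four products in data[0..3] in one step. */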
alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } From be9527cf1d1c4a7f219d3d7bf58077939826ae68 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 11:07:13 +0300 Subject: [PATCH 09/13] [alf] Change the order of alf_covariance.ee values to get a better optimized solution for alf_get_blk_stats_avx2() --- src/alf.c | 42 ++++++++++++++-------------- src/alf.h | 2 +- src/strategies/avx2/alf-avx2.c | 17 ++++------- src/strategies/generic/alf-generic.c | 6 ++-- 4 files changed, 30 insertions(+), 37 deletions(-) diff --git a/src/alf.c b/src/alf.c index fa5d7aa3..6f1a3939 100644 --- a/src/alf.c +++ b/src/alf.c @@ -119,7 +119,7 @@ static void get_clip_max(const alf_covariance *cov, int *clip_max) { for (int l = 0; inc && l < num_coeff; ++l) { - if (cov->ee[clip_max[k]][0][k][l] != cov->ee[clip_max[k] + 1][0][k][l]) + if (cov->ee[k][l][clip_max[k]][0] != cov->ee[k][l][clip_max[k] + 1][0]) { inc = false; } @@ -142,7 +142,7 @@ static void reduce_clip_cost(const alf_covariance *cov, int *clip) { for (int l = 0; dec && l < cov->num_coeff; ++l) { - if (cov->ee[clip[k]][clip[l]][k][l] != cov->ee[clip[k] - 1][clip[l]][k][l]) + if (cov->ee[k][l][clip[k]][clip[l]] != cov->ee[k][l][clip[k] - 1][clip[l]]) { dec = false; } @@ -162,7 +162,7 @@ static void set_ey_from_clip(const alf_covariance *cov, const int* clip, double y[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ee[k][l] = cov->ee[clip[k]][clip[l]][k][l]; + ee[k][l] = cov->ee[k][l][clip[k]][clip[l]]; } } } @@ -352,8 +352,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } gns_solve_by_chol(ke, ky, f, size); @@ -373,8 +373,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } gns_solve_by_chol(ke, ky, f, size); @@ -392,8 +392,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } } @@ -404,8 +404,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[idx_min] = cov->y[clip[idx_min]][idx_min]; for (int l = 0; l < size; l++) { - ke[idx_min][l] = cov->ee[clip[idx_min]][clip[l]][idx_min][l]; - ke[l][idx_min] = cov->ee[clip[l]][clip[idx_min]][l][idx_min]; + ke[idx_min][l] = cov->ee[idx_min][l][clip[idx_min]][clip[l]]; + ke[l][idx_min] = cov->ee[l][idx_min][clip[l]][clip[idx_min]]; } } else @@ -469,9 +469,9 @@ static double calc_error_for_coeffs(const alf_covariance *cov, const int *clip, double sum = 0; for (int j = i + 1; j < num_coeff; j++) { - sum += cov->ee[clip[i]][clip[j]][i][j] * coeff[j]; + sum += cov->ee[i][j][clip[i]][clip[j]] * coeff[j]; } - error += ((cov->ee[clip[i]][clip[i]][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i]; + error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i])
* coeff[i]; } return error / factor; @@ -488,9 +488,9 @@ static double calc_error_for_cc_alf_coeffs(const alf_covariance *cov, const int1 for (int j = i + 1; j < num_coeff; j++) { // E[j][i] = E[i][j], sum will be multiplied by 2 later - sum += cov->ee[0][0][i][j] * coeff[j]; + sum += cov->ee[i][j][0][0] * coeff[j]; } - error += ((cov->ee[0][0][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; + error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; } return error / factor; @@ -753,7 +753,7 @@ static void add_alf_cov(alf_covariance *dst, alf_covariance *src) { for (int i = 0; i < num_coeff; i++) { - dst->ee[b0][b1][j][i] += src->ee[b0][b1][j][i]; + dst->ee[j][i][b0][b1] += src->ee[j][i][b0][b1]; } } } @@ -780,7 +780,7 @@ static void add_alf_cov_lhs_rhs(alf_covariance *dst, alf_covariance *lhs, alf_co { for (int i = 0; i < num_coeff; i++) { - dst->ee[b0][b1][j][i] = lhs->ee[b0][b1][j][i] + rhs->ee[b0][b1][j][i]; + dst->ee[j][i][b0][b1] = lhs->ee[j][i][b0][b1] + rhs->ee[j][i][b0][b1]; } } } @@ -1972,7 +1972,7 @@ static void derive_cc_alf_filter_coeff(alf_covariance *alf_covariance_frame_cc_a ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[0][k]; for (int l = 0; l < size; l++) { - k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[0][0][k][l]; + k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[k][l][0][0]; } } @@ -2766,11 +2766,11 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { if (0 /*g_alf_wssd*/) { - alf_covariance->ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + alf_covariance->ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]); } else { - alf_covariance->ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance->ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; } } } @@ -2826,7 +2826,7 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { for (int b1 = 0; b1 < num_bins; b1++) { - alf_covariance->ee[b0][b1][k][l] = alf_covariance->ee[b1][b0][l][k]; + alf_covariance->ee[k][l][b0][b1] = alf_covariance->ee[l][k][b1][b0]; } } } diff --git a/src/alf.h b/src/alf.h index 862b284d..793102ac 100644 --- a/src/alf.h +++ b/src/alf.h @@ -176,7 +176,7 @@ typedef enum { PACK( typedef struct alf_covariance { double pix_acc; - int64_t ee[MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF]; + int64_t ee[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES]; int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF]; int num_coeff; int num_bins; diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 7fd625b4..91b79287 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -234,22 +234,15 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, __m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]); /*for (int b1 = 0; b1 < 4; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; }*/ - //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32); - int64_t data[4]; - _mm256_storeu_si256((__m256i*)data, multiplied); - - - 
alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; - alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; - alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; - alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; - + __m256i orig = _mm256_lddqu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]); + _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0], _mm256_add_epi64(multiplied, orig)); + } } /* @@ -288,7 +281,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, { for (int b1 = 0; b1 < 4; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0]; } } } diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index e37acbb6..99def841 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -933,11 +933,11 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { if (0/*m_alfWSSD*/) { - alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + alf_covariance[class_idx].ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]); } else { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; } } } @@ -978,7 +978,7 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { for (int b1 = 0; b1 < num_bins; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0]; } } } From 5df8add04610288ca188f344fa8e683d2d392ab1 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 15:37:01 +0300 Subject: [PATCH 10/13] [alf] Change order of alf_covariance.y array for better AVX2 optimization in alf_get_blk_stats_avx2() --- src/alf.c | 30 ++++++++++++++-------------- src/alf.h | 2 +- src/strategies/avx2/alf-avx2.c | 17 +++++----------- src/strategies/generic/alf-generic.c | 4 ++-- 4 files changed, 23 insertions(+), 30 deletions(-) diff --git a/src/alf.c b/src/alf.c index 6f1a3939..19c6893e 100644 --- a/src/alf.c +++ b/src/alf.c @@ -115,7 +115,7 @@ static void get_clip_max(const alf_covariance *cov, int *clip_max) clip_max[k] = 0; bool inc = true; - while (inc && clip_max[k] + 1 < cov->num_bins && cov->y[clip_max[k] + 1][k] == cov->y[clip_max[k]][k]) + while (inc && clip_max[k] + 1 < cov->num_bins && cov->y[k][clip_max[k] + 1] == cov->y[k][clip_max[k]]) { for (int l = 0; inc && l < num_coeff; ++l) { @@ -138,7 +138,7 @@ static void reduce_clip_cost(const alf_covariance *cov, int *clip) for (int k = 0; k < cov->num_coeff - 1; ++k) { bool dec = true; - while (dec && clip[k] > 0 && cov->y[clip[k] - 1][k] == cov->y[clip[k]][k]) + while (dec && clip[k] > 0 && cov->y[k][clip[k] - 1] == cov->y[k][clip[k]]) { for (int l = 0; dec && l < cov->num_coeff; ++l) { @@ -159,7 +159,7 @@ static void set_ey_from_clip(const alf_covariance *cov, const int* clip, double { for (int k = 0; k < size; k++) { - y[k] = cov->y[clip[k]][k]; + y[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ee[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -304,7 +304,7 @@ static double calculate_error(const alf_covariance *cov, const int *clip, const double sum = 0; for (int i = 0; i < cov->num_coeff; i++) { - sum += coeff[i] * cov->y[clip[i]][i]; + sum += 
coeff[i] * cov->y[i][clip[i]]; } return cov->pix_acc - sum; @@ -349,7 +349,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b if (clip[k] - step >= clip_max[k]) { clip[k] -= step; - ky[k] = cov->y[clip[k]][k]; + ky[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -370,7 +370,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b if (clip[k] + step < cov->num_bins) { clip[k] += step; - ky[k] = cov->y[clip[k]][k]; + ky[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -389,7 +389,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b clip[k] -= step; } - ky[k] = cov->y[clip[k]][k]; + ky[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -401,7 +401,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b { err_best = err_min; clip[idx_min] += inc_min; - ky[idx_min] = cov->y[clip[idx_min]][idx_min]; + ky[idx_min] = cov->y[idx_min][clip[idx_min]]; for (int l = 0; l < size; l++) { ke[idx_min][l] = cov->ee[idx_min][l][clip[idx_min]][clip[l]]; @@ -471,7 +471,7 @@ static double calc_error_for_coeffs(const alf_covariance *cov, const int *clip, { sum += cov->ee[i][j][clip[i]][clip[j]] * coeff[j]; } - error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i]; + error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[i][clip[i]]) * coeff[i]; } return error / factor; @@ -490,7 +490,7 @@ static double calc_error_for_cc_alf_coeffs(const alf_covariance *cov, const int1 // E[j][i] = E[i][j], sum will be multiplied by 2 later sum += cov->ee[i][j][0][0] * coeff[j]; } - error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; + error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[i][0]) * coeff[i]; } return error / factor; @@ -762,7 +762,7 @@ static void add_alf_cov(alf_covariance *dst, alf_covariance *src) { for (int j = 0; j < num_coeff; j++) { - dst->y[b][j] += src->y[b][j]; + dst->y[j][b] += src->y[j][b]; } } dst->pix_acc += src->pix_acc; @@ -789,7 +789,7 @@ static void add_alf_cov_lhs_rhs(alf_covariance *dst, alf_covariance *lhs, alf_co { for (int j = 0; j < num_coeff; j++) { - dst->y[b][j] = lhs->y[b][j] + rhs->y[b][j]; + dst->y[j][b] = lhs->y[j][b] + rhs->y[j][b]; } } dst->pix_acc = lhs->pix_acc + rhs->pix_acc; @@ -1969,7 +1969,7 @@ static void derive_cc_alf_filter_coeff(alf_covariance *alf_covariance_frame_cc_a for (int k = 0; k < size; k++) { - ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[0][k]; + ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[k][0]; for (int l = 0; l < size; l++) { k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[k][l][0][0]; @@ -2779,11 +2779,11 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { if (0 /*g_alf_wssd*/) { - alf_covariance->y[b][k] += weight * (e_local[k][b] * (double)y_local); + alf_covariance->y[k][b] += weight * (e_local[k][b] * (double)y_local); } else { - alf_covariance->y[b][k] += e_local[k][b] * (double)y_local; + alf_covariance->y[k][b] += e_local[k][b] * (double)y_local; } } } diff --git a/src/alf.h b/src/alf.h index 793102ac..1cc0a71b 100644 --- a/src/alf.h +++ b/src/alf.h @@ -177,7 +177,7 @@ PACK( typedef struct alf_covariance { double pix_acc; int64_t 
ee[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES]; - int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF]; + int32_t y[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; int num_coeff; int num_bins; } alf_covariance;) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 91b79287..1048e041 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -240,7 +240,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32); - __m256i orig = _mm256_lddqu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]); + __m256i orig = _mm256_loadu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]); _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0], _mm256_add_epi64(multiplied, orig)); } @@ -248,21 +248,14 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, /* for (int b = 0; b < 4; b++) { - alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local; }*/ __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); - __m256i multiplied = _mm256_mul_epi32(y_local_32, e_local_32); - //__m256i output = _mm256_permutevar8x32_epi32(multiplied, perm_mask); - - int64_t data[4]; - _mm256_storeu_si256((__m256i*)data, multiplied); - - alf_covariance[class_idx].y[0][k] += data[0]; - alf_covariance[class_idx].y[1][k] += data[1]; - alf_covariance[class_idx].y[2][k] += data[2]; - alf_covariance[class_idx].y[3][k] += data[3]; + __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); + __m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]); + _mm_store_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); } alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index 99def841..6758fa4e 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -946,11 +946,11 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { if (0/*m_alfWSSD*/) { - alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); + alf_covariance[class_idx].y[k][b] += weight * (e_local[k][b] * (double)y_local); } else { - alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local; } } } From 671497326453354bd0851a72c693d897f9a78da8 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 18:05:06 +0300 Subject: [PATCH 11/13] [alf] Change _mm_store_si128 to _mm_storeu_si128 in alf_get_blk_stats_avx2() --- src/strategies/avx2/alf-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 1048e041..e0ad40a4 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -255,7 +255,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); __m128i orig = _mm_loadu_si128((__m128i*) 
&alf_covariance[class_idx].y[k][0]); - _mm_store_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); + _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); } alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } From fdf125f406b73bdfa8cecf28d843f9f02d02aa2d Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 27 Aug 2021 10:25:20 +0300 Subject: [PATCH 12/13] [alf] Fix incorrect conversion in alf_get_blk_stats_avx2 --- src/strategies/avx2/alf-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index e0ad40a4..1d4a8e9c 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -252,7 +252,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, }*/ __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); - __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); + __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); __m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]); _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); From 26f18865f7b6600ed811580c365b41469a45820f Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 27 Aug 2021 13:40:28 +0300 Subject: [PATCH 13/13] [alf] Change the processing in alf_get_blk_stats_avx2() to allow utilizing the whole 256-bit register --- src/strategies/avx2/alf-avx2.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 1d4a8e9c..e832eb48 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -249,14 +249,23 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, for (int b = 0; b < 4; b++) { alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local; - }*/ + }*/ - __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); + } + for (int k = 0; k < num_coeff-1; k+=2) + { + __m128i e_local_1 = _mm_loadu_si128((__m128i*) & e_local[k][0]); __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); - __m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]); - _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); + __m256i orig = _mm256_loadu_si256((__m256i*) & alf_covariance[class_idx].y[k][0]); + _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].y[k], _mm256_add_epi32(multiplied, orig)); } + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[num_coeff-1][0]); + __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); + __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); + __m128i orig = _mm_loadu_si128((__m128i*) & alf_covariance[class_idx].y[num_coeff - 1][0]); + _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[num_coeff - 1], _mm_add_epi32(_mm256_castsi256_si128(multiplied), orig)); + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } org += org_stride;
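/* A scalar sketch of the widened loop above (illustrative only): two
   coefficient rows of e_local are consumed per iteration so that all eight
   32-bit lanes of the 256-bit register do useful work,
     for (int k = 0; k + 1 < num_coeff; k += 2)
       for (int b = 0; b < 4; b++) {
         alf_covariance[class_idx].y[k][b]     += e_local[k][b]     * y_local;
         alf_covariance[class_idx].y[k + 1][b] += e_local[k + 1][b] * y_local;
       }
   and the last row (num_coeff is odd: 13 for luma, 7 for chroma) is finished
   by the separate 128-bit tail step above. */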