/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2015 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 2.1 as
 * published by the Free Software Foundation.
 *
 * Kvazaar is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/
 
#include "greatest/greatest.h"

#include "test_strategies.h"

#include "src/image.h"
#include "src/threads.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>


//////////////////////////////////////////////////////////////////////////
// MACROS
// Number of separate test buffers; the benchmark loops cycle through them.
#define NUM_TESTS 113
// Number of 64x64 pixel chunks stored back to back in every test buffer.
#define NUM_CHUNKS 36
// NOTE(review): LCU_MAX_LOG_W and LCU_MIN_LOG_W are not referenced in the
// visible part of this file. Presumably log2 of the largest and smallest
// block width -- confirm, or remove if unused.
#define LCU_MAX_LOG_W 6
#define LCU_MIN_LOG_W 2

// Time per tested function, in seconds.
// Every benchmark loop keeps running until this much time has elapsed.
#define TIME_PER_TEST 1.0

//////////////////////////////////////////////////////////////////////////
// GLOBALS
// bufs[i] is actual_bufs[i] rounded to SIMD_ALIGNMENT. Tests read pixels
// through bufs; only the pointers in actual_bufs are passed to free().
static kvz_pixel * bufs[NUM_TESTS]; // SIMD aligned pointers.
static kvz_pixel * actual_bufs[NUM_TESTS]; // pointers returned by malloc.

// Parameters of the test that is about to run. The suite fills this in before
// each RUN_TEST and the test functions read it.
static struct test_env_t {
  int log_width; // log2 of the block width; tests use 1 << log_width.
  void * tested_func; // Function being timed; each test assigns it to the function pointer type it expects.
  const strategy_t * strategy; // Strategy being timed; supplies type and name for the report.
  char msg[1024]; // Holds the formatted result message handed to PASSm.
} test_env;


//////////////////////////////////////////////////////////////////////////
// SETUP, TEARDOWN AND HELPER FUNCTIONS
// Fill a width x width block (row stride == width) with a radial gradient.
//
// x_px, y_px  Coordinates of the gradient centre; may lie outside the block.
// width       Width and height of the block in pixels.
// slope       Multiplier applied to the distance from the centre.
// buf         Destination; must have room for width * width pixels.
//
// Each pixel is slope times its Euclidean distance to the centre, with 0.5
// added before the conversion to int, then clipped to [0, 255].
static void init_gradient(int x_px, int y_px, int width, int slope, kvz_pixel *buf)
{
  kvz_pixel *row = buf;
  for (int y = 0; y < width; ++y, row += width) {
    const int dy = y_px - y;
    for (int x = 0; x < width; ++x) {
      const int dx = x_px - x;
      const int val = slope * sqrt(dx * dx + dy * dy) + 0.5;
      row[x] = CLIP(0, 255, val);
    }
  }
}


// Allocate and initialise every test buffer.
//
// Each of the NUM_TESTS buffers holds NUM_CHUNKS chunks of 64x64 pixels.
// The raw allocation is kept in actual_bufs and a SIMD_ALIGNMENT-aligned
// pointer into it in bufs. Every chunk is filled with a gradient whose centre
// depends on (test + chunk), so neighbouring chunks differ from each other.
//
// The process is aborted if an allocation fails, since no test can run
// without its buffers.
static void setup_tests(void)
{
  const int width = 64;
  const size_t chunk_px = (size_t)width * width;
  const size_t buf_px = NUM_CHUNKS * chunk_px;

  for (int test = 0; test < NUM_TESTS; ++test) {
    // Allocate SIMD_ALIGNMENT extra bytes so that an aligned start address
    // always fits inside the allocation.
    actual_bufs[test] = malloc(buf_px * sizeof(kvz_pixel) + SIMD_ALIGNMENT);
    if (actual_bufs[test] == NULL) {
      fprintf(stderr, "speed_tests: failed to allocate test buffer %d\n", test);
      abort();
    }
    bufs[test] = ALIGNED_POINTER(actual_bufs[test], SIMD_ALIGNMENT);
  }

  for (int test = 0; test < NUM_TESTS; ++test) {
    for (int chunk = 0; chunk < NUM_CHUNKS; ++chunk) {
      int x = (test + chunk) % width;
      int y = (test + chunk) / width;
      init_gradient(width - x, y, width, 255 / width, &bufs[test][chunk * chunk_px]);
    }
  }
}

static void tear_down_tests()
{
  for (int test = 0; test < NUM_TESTS; ++test) {
    free(actual_bufs[test]);
  }
}

//////////////////////////////////////////////////////////////////////////
// TESTS

TEST test_intra_speed(const int width)
{
  const int size = width * width;
  uint64_t call_cnt = 0;
  KVZ_CLOCK_T clock_now;
  KVZ_GET_TIME(&clock_now);
  double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;

  // Loop until time allocated for test has passed.
  for (unsigned i = 0; 
      test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
      ++i)
  {
    int test = i % NUM_TESTS;
    uint64_t sum = 0;
    for (int offset = 0; offset < NUM_CHUNKS * 64 * 64; offset += NUM_CHUNKS * size) {
      // Compare the first chunk against the 35 other chunks to simulate real usage.
      kvz_pixel * buf1 = &bufs[test][offset];
      for (int chunk = 1; chunk < NUM_CHUNKS; ++chunk) {
        kvz_pixel * buf2 = &bufs[test][chunk * size + offset];

        cost_pixel_nxn_func *tested_func = test_env.tested_func;
        sum += tested_func(buf1, buf2);
        ++call_cnt;
      }
    }

    ASSERT(sum > 0);
    KVZ_GET_TIME(&clock_now)
  }

  sprintf(test_env.msg, "%.3fM x %s:%s",
    (double)call_cnt / 1000000.0,
    test_env.strategy->type,
    test_env.strategy->strategy_name);
  PASSm(test_env.msg);
}


TEST test_intra_dual_speed(const int width)
{
  const int size = width * width;
  uint64_t call_cnt = 0;
  KVZ_CLOCK_T clock_now;
  KVZ_GET_TIME(&clock_now);
  double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;

  // Loop until time allocated for test has passed.
  for (unsigned i = 0;
    test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
    ++i)
  {
    int test = i % NUM_TESTS;
    uint64_t sum = 0;
    for (int offset = 0; offset < NUM_CHUNKS * 64 * 64; offset += NUM_CHUNKS * size) {
      // Compare the first chunk against the 35 other chunks to simulate real usage.
      kvz_pixel * buf1 = &bufs[test][offset];
      for (int chunk = 0; chunk < NUM_CHUNKS; chunk += 2) {
        cost_pixel_nxn_multi_func *tested_func = test_env.tested_func;
        const kvz_pixel *buf_pair[2] = { &bufs[test][chunk * size + offset], &bufs[test][(chunk + 1) * size + offset] };
        unsigned costs[2] = { 0, 0 };
        tested_func((pred_buffer)buf_pair, buf1, 2, costs);
        sum += costs[0] + costs[1];
        ++call_cnt;
      }
    }

    ASSERT(sum > 0);
    KVZ_GET_TIME(&clock_now)
  }

  sprintf(test_env.msg, "%.3fM x %s:%s",
    (double)call_cnt / 1000000.0,
    test_env.strategy->type,
    test_env.strategy->strategy_name);
  PASSm(test_env.msg);
}


// Time test_env.tested_func as a reg_sad_func on width x width blocks.
//
// The function is called repeatedly until TIME_PER_TEST seconds have passed.
// The test passes with a message giving the number of calls in millions, and
// fails if the costs of one pass over a test buffer add up to zero.
//
// width must be at least 8: the 8x8 search window below only fits inside
// four chunks for blocks of that size or larger.
TEST test_inter_speed(const int width)
{
  const int size = width * width;
  // Four consecutive chunks are addressed as one picture with twice the block
  // width and height, so a picture row is 2 * width pixels long. Using this
  // stride keeps every read inside the four chunks. A fixed stride of 2 * 64
  // would make the last chunk group read past the end of the buffer for
  // widths below 64.
  const int stride = 2 * width;
  uint64_t call_cnt = 0;
  KVZ_CLOCK_T clock_now;
  KVZ_GET_TIME(&clock_now);
  double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;

  // Loop until time allocated for test has passed.
  for (unsigned i = 0;
      test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
      ++i)
  {
    int test = i % NUM_TESTS;
    uint64_t sum = 0;
    for (int offset = 0; offset < NUM_CHUNKS * 64 * 64; offset += NUM_CHUNKS * size) {
      // Treat 4 consecutive chunks as one chunk with double width and height,
      // and do a 8x8 grid search against the first chunk to simulate real usage.
      kvz_pixel * buf1 = &bufs[test][offset];
      for (int chunk = 0; chunk < NUM_CHUNKS; chunk += 4) {
        kvz_pixel * buf2 = &bufs[test][chunk * size + offset];
        for (int y = 0; y < 8; ++y) {
          for (int x = 0; x < 8; ++x) {
            reg_sad_func *tested_func = test_env.tested_func;
            sum += tested_func(buf1, &buf2[y * stride + x], width, width, stride, stride);
            ++call_cnt;
          }
        }
      }
    }
    ASSERT(sum > 0);
    KVZ_GET_TIME(&clock_now);
  }

  sprintf(test_env.msg, "%.3fM x %s(%ix%i):%s",
    (double)call_cnt / 1000000.0,
    test_env.strategy->type,
    width,
    width,
    test_env.strategy->strategy_name);
  PASSm(test_env.msg);
}


TEST dct_speed(const int width)
{
  const int size = width * width;
  uint64_t call_cnt = 0;
  dct_func * tested_func = test_env.strategy->fptr;

  KVZ_CLOCK_T clock_now;
  KVZ_GET_TIME(&clock_now);
  double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;

  int16_t _tmp_residual[32 * 32 + SIMD_ALIGNMENT];
  int16_t _tmp_coeffs[32 * 32 + SIMD_ALIGNMENT];
  int16_t *tmp_residual = ALIGNED_POINTER(_tmp_residual, SIMD_ALIGNMENT);
  int16_t *tmp_coeffs = ALIGNED_POINTER(_tmp_coeffs, SIMD_ALIGNMENT);
  
  // Loop until time allocated for test has passed.
  for (unsigned i = 0;
    test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
    ++i)
  {
    int test = i % NUM_TESTS;
    uint64_t sum = 0;
    for (int offset = 0; offset < NUM_CHUNKS * 64 * 64; offset += NUM_CHUNKS * size) {
      // Compare the first chunk against the 35 other chunks to simulate real usage.
      for (int chunk = 0; chunk < NUM_CHUNKS; ++chunk) {
        kvz_pixel * buf1 = &bufs[test][offset];
        kvz_pixel * buf2 = &bufs[test][chunk * size + offset];
        for (int p = 0; p < size; ++p) {
          tmp_residual[p] = (int16_t)(buf1[p] - buf2[p]);
        }

        tested_func(8, tmp_residual, tmp_coeffs);
        ++call_cnt;
        sum += tmp_coeffs[0];
      }
    }

    ASSERT(sum > 0);
    KVZ_GET_TIME(&clock_now)
  }
  
  sprintf(test_env.msg, "%.3fM x %s:%s",
    (double)call_cnt / 1000000.0,
    test_env.strategy->type,
    test_env.strategy->strategy_name);
  PASSm(test_env.msg);
}


// Speed test for single-block SAD strategies; the block width is taken from
// test_env.log_width.
TEST intra_sad(void)
{
  return test_intra_speed(1 << test_env.log_width);
}


// Speed test for dual SAD strategies; the block width is taken from
// test_env.log_width.
TEST intra_sad_dual(void)
{
  return test_intra_dual_speed(1 << test_env.log_width);
}


// Speed test for single-block SATD strategies; the block width is taken from
// test_env.log_width. Runs the same loop as intra_sad under its own test name.
TEST intra_satd(void)
{
  return test_intra_speed(1 << test_env.log_width);
}


// Speed test for dual SATD strategies; the block width is taken from
// test_env.log_width. Runs the same loop as intra_sad_dual under its own test
// name.
TEST intra_satd_dual(void)
{
  return test_intra_dual_speed(1 << test_env.log_width);
}


// Speed test for the reg_sad strategy; the block width is taken from
// test_env.log_width.
TEST inter_sad(void)
{
  return test_inter_speed(1 << test_env.log_width);
}


// Speed test for forward transform strategies; the block width is taken from
// test_env.log_width.
TEST fdct(void)
{
  return dct_speed(1 << test_env.log_width);
}


// Speed test for inverse transform strategies; the block width is taken from
// test_env.log_width. Runs the same loop as fdct under its own test name.
TEST idct(void)
{
  return dct_speed(1 << test_env.log_width);
}


//////////////////////////////////////////////////////////////////////////
// TEST FIXTURES
SUITE(speed_tests)
{
  //SET_SETUP(sad_setup);
  //SET_TEARDOWN(sad_teardown);

  setup_tests();

  // Loop through all strategies picking out the intra sad ones and run
  // selectec strategies though all tests
  for (unsigned i = 0; i < strategies.count; ++i) {
    const strategy_t * strategy = &strategies.strategies[i];

    // Select buffer width according to function name.
    if (strstr(strategy->type, "_4x4")) {
      test_env.log_width = 2;
    } else if (strstr(strategy->type, "_8x8")) {
      test_env.log_width = 3;
    } else if (strstr(strategy->type, "_16x16")) {
      test_env.log_width = 4;
    } else if (strstr(strategy->type, "_32x32")) {
      test_env.log_width = 5;
    } else if (strstr(strategy->type, "_64x64")) {
      test_env.log_width = 6;
    } else {
      test_env.log_width = 0;
    }

    test_env.tested_func = strategies.strategies[i].fptr;
    test_env.strategy = strategy;

    // Call different tests depending on type of function.
    // This allows for selecting a subset of tests with -t parameter.
    if (strncmp(strategy->type, "satd_", 5) == 0 && strcmp(strategy->type, "satd_any_size") != 0) {
      if (strlen(strategy->type) <= 10) {
        RUN_TEST(intra_satd);
      } else if (strstr(strategy->type, "_dual")) {
        RUN_TEST(intra_satd_dual);
      }
    } else if (strncmp(strategy->type, "sad_", 4) == 0) {
      if (strlen(strategy->type) <= 9) {
        RUN_TEST(intra_sad);
      } else if (strstr(strategy->type, "_dual")) {
        RUN_TEST(intra_sad_dual);
      }
    } else if (strcmp(strategy->type, "reg_sad") == 0) {
      // Call reg_sad with all the sizes it is actually called with.
      for (int width = 3; width <= 6; ++width) {
        test_env.log_width = width;
        RUN_TEST(inter_sad);
      }
    } else if (strncmp(strategy->type, "dct_", 4) == 0 ||
               strcmp(strategy->type, "fast_forward_dst_4x4") == 0)
    {
      RUN_TEST(fdct);
    } else if (strncmp(strategy->type, "idct_", 4) == 0 ||
               strcmp(strategy->type, "fast_inverse_dst_4x4") == 0)
    {
      RUN_TEST(idct);
    }
  }

  tear_down_tests();
}