uvg266/src/image.c

/*****************************************************************************
 * This file is part of uvg266 VVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
 ****************************************************************************/

#include "image.h"

#include <limits.h>
#include <stdlib.h>

#include "strategies/strategies-ipol.h"
#include "strategies/strategies-picture.h"
#include "threads.h"

/**
* \brief Allocate a new image with 420.
* This function signature is part of the libkvz API.
* \return image pointer or NULL on failure
*/
uvg_picture * uvg_image_alloc_420(const int32_t width, const int32_t height)
{
  return uvg_image_alloc(UVG_CSP_420, width, height);
}

/**
 * \brief Allocate a new image.
 * \return image pointer or NULL on failure
 */
uvg_picture * uvg_image_alloc(enum uvg_chroma_format chroma_format, const int32_t width, const int32_t height)
{
  //Assert that we have a well defined image
  assert((width % 2) == 0);
  assert((height % 2) == 0);

  const size_t simd_padding_width = 64;

  uvg_picture *im = MALLOC(uvg_picture, 1);
  if (!im) return NULL;

  //Add 4 pixel boundary to each side of luma for ALF
  //This results also 2 pixel boundary for chroma
  unsigned int luma_size = (width + FRAME_PADDING_LUMA) * (height + FRAME_PADDING_LUMA);

  unsigned chroma_sizes[] = { 0, luma_size / 4, luma_size / 2, luma_size };
  unsigned chroma_size = chroma_sizes[chroma_format];

  im->chroma_format = chroma_format;

  //Allocate memory, pad the full data buffer from both ends
  im->fulldata_buf = MALLOC_SIMD_PADDED(uvg_pixel, (luma_size + 2 * chroma_size), simd_padding_width * 2);
  if (!im->fulldata_buf) {
    free(im);
    return NULL;
  }

  //Shift the image to allow ALF filtering
  im->refcount = 1; //We give a reference to caller
  im->width = width;
  im->height = height;
  im->stride = width + FRAME_PADDING_LUMA;
  im->chroma_format = chroma_format;
  const int padding_before_first_pixel_luma = (FRAME_PADDING_LUMA / 2) * (im->stride) + FRAME_PADDING_LUMA / 2;
  const int padding_before_first_pixel_chroma = (FRAME_PADDING_CHROMA / 2) * (im->stride/2) + FRAME_PADDING_CHROMA / 2;
  im->fulldata = &im->fulldata_buf[padding_before_first_pixel_luma] + simd_padding_width / sizeof(uvg_pixel);
  im->base_image = im;

  im->y = im->data[COLOR_Y] = &im->fulldata[0];

  if (chroma_format == UVG_CSP_400) {
    im->u = im->data[COLOR_U] = NULL;
    im->v = im->data[COLOR_V] = NULL;
  } else {
    im->u = im->data[COLOR_U] = &im->fulldata[luma_size - padding_before_first_pixel_luma + padding_before_first_pixel_chroma];
    im->v = im->data[COLOR_V] = &im->fulldata[luma_size - padding_before_first_pixel_luma + chroma_size + padding_before_first_pixel_chroma];
  }

  im->pts = 0;
  im->dts = 0;

  im->interlacing = UVG_INTERLACING_NONE;

  im->roi.roi_array = NULL;
  im->roi.width = 0;
  im->roi.height = 0;

  return im;
}

/**
 * \brief Free an image.
 *
 * Decrement reference count of the image and deallocate associated memory
 * if no references exist any more.
 *
 * \param im image to free
 */
void uvg_image_free(uvg_picture *const im)
{
  if (im == NULL) return;

  int32_t new_refcount = UVG_ATOMIC_DEC(&(im->refcount));
  if (new_refcount > 0) {
    // There are still references so we don't free the data yet.
    return;
  }

  if (im->base_image != im) {
    // Free our reference to the base image.
    uvg_image_free(im->base_image);
  } else {
    free(im->fulldata_buf);
    if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array);
  }

  // Make sure freed data won't be used.
  im->base_image = NULL;
  im->fulldata_buf = NULL;
  im->fulldata = NULL;
  im->y = im->u = im->v = NULL;
  im->data[COLOR_Y] = im->data[COLOR_U] = im->data[COLOR_V] = NULL;
  free(im);
}

/**
 * \brief Get a new pointer to an image.
 *
 * Increment reference count and return the image.
 */
uvg_picture *uvg_image_copy_ref(uvg_picture *im)
{
  int32_t new_refcount = UVG_ATOMIC_INC(&im->refcount);
  // The caller should have had another reference and we added one
  // reference so refcount should be at least 2.
  assert(new_refcount >= 2);
  return im;
}

uvg_picture *uvg_image_make_subimage(uvg_picture *const orig_image,
                             const unsigned x_offset,
                             const unsigned y_offset,
                             const unsigned width,
                             const unsigned height)
{
  // Assert that we have a well defined image
  assert((width % 2) == 0);
  assert((height % 2) == 0);

  assert((x_offset % 2) == 0);
  assert((y_offset % 2) == 0);

  assert(x_offset + width <= orig_image->width);
  assert(y_offset + height <= orig_image->height);

  uvg_picture *im = MALLOC(uvg_picture, 1);
  if (!im) return NULL;

  im->base_image = uvg_image_copy_ref(orig_image->base_image);
  im->refcount = 1; // We give a reference to caller
  im->width = width;
  im->height = height;
  im->stride = orig_image->stride;
  im->chroma_format = orig_image->chroma_format;

  im->y = im->data[COLOR_Y] = &orig_image->y[x_offset + y_offset * orig_image->stride];
  if (orig_image->chroma_format != UVG_CSP_400) {
    im->u = im->data[COLOR_U] = &orig_image->u[x_offset / 2 + y_offset / 2 * orig_image->stride / 2];
    im->v = im->data[COLOR_V] = &orig_image->v[x_offset / 2 + y_offset / 2 * orig_image->stride / 2];
  }

  im->pts = 0;
  im->dts = 0;

  im->roi = orig_image->roi;

  return im;
}

yuv_t * uvg_yuv_t_alloc(int luma_size, int chroma_size)
{
  yuv_t *yuv = (yuv_t *)malloc(sizeof(*yuv));
  yuv->size = luma_size;

  // Get buffers with separate mallocs in order to take advantage of
  // automatic buffer overrun checks.
  yuv->y = (uvg_pixel *)malloc(luma_size * sizeof(*yuv->y));
  if (chroma_size == 0) {
    yuv->u = NULL;
    yuv->v = NULL;
  } else {
    yuv->u = (uvg_pixel *)malloc(chroma_size * sizeof(*yuv->u));
    yuv->v = (uvg_pixel *)malloc(chroma_size * sizeof(*yuv->v));
  }

  return yuv;
}

void uvg_yuv_t_free(yuv_t *yuv)
{
  if (yuv) {
    FREE_POINTER(yuv->y);
    FREE_POINTER(yuv->u);
    FREE_POINTER(yuv->v);
  }
  FREE_POINTER(yuv);
}

hi_prec_buf_t * uvg_hi_prec_buf_t_alloc(int luma_size)
{
  // Get buffers with separate mallocs in order to take advantage of
  // automatic buffer overrun checks.
  hi_prec_buf_t *yuv = (hi_prec_buf_t *)malloc(sizeof(*yuv));
  yuv->y = (int16_t *)malloc(luma_size * sizeof(*yuv->y));
  yuv->u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->u));
  yuv->v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->v));
  yuv->joint_u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->u));
  yuv->joint_v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->v));
  yuv->size = luma_size;

  return yuv;
}

void uvg_hi_prec_buf_t_free(hi_prec_buf_t * yuv)
{
  free(yuv->y);
  free(yuv->u);
  free(yuv->v);
  free(yuv->joint_v);
  free(yuv->joint_u);
  free(yuv);
}

static INLINE uint32_t reg_sad_maybe_optimized(const uvg_pixel * const data1, const uvg_pixel * const data2,
                                  const int32_t width, const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2, optimized_sad_func_ptr_t optimized_sad)
{
  if (optimized_sad != NULL)
    return optimized_sad(data1, data2, height, stride1, stride2);
  else
    return uvg_reg_sad(data1, data2, width, height, stride1, stride2);
}

/**
 * \brief Diagonally interpolate SAD outside the frame.
 *
 * \param data1   Starting point of the first picture.
 * \param data2   Starting point of the second picture.
 * \param width   Width of the region for which SAD is calculated.
 * \param height  Height of the region for which SAD is calculated.
 * \param width  Width of the pixel array.
 *
 * \returns Sum of Absolute Differences
 */
static unsigned cor_sad(const uvg_pixel *pic_data, const uvg_pixel *ref_data,
                        int block_width, int block_height, unsigned pic_stride)
{
  uvg_pixel ref = *ref_data;
  int x, y;
  unsigned sad = 0;

  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
      sad += abs(pic_data[y * pic_stride + x] - ref);
    }
  }

  return sad;
}


/**
 * \brief  Handle special cases of comparing blocks that are not completely
 *         inside the frame.
 *
 * \param pic  First frame.
 * \param ref  Second frame.
 * \param pic_x  X coordinate of the first block.
 * \param pic_y  Y coordinate of the first block.
 * \param ref_x  X coordinate of the second block.
 * \param ref_y  Y coordinate of the second block.
 * \param block_width  Width of the blocks.
 * \param block_height  Height of the blocks.
 */
static unsigned image_interpolated_sad(const uvg_picture *pic, const uvg_picture *ref,
                                 int pic_x, int pic_y, int ref_x, int ref_y,
                                 int block_width, int block_height,
                                 optimized_sad_func_ptr_t optimized_sad)
{
  uvg_pixel *pic_data, *ref_data;

  int left, right, top, bottom;
  int result = 0;

  // Change the movement vector to point right next to the frame. This doesn't
  // affect the result but removes some special cases.
  if (ref_x > ref->width)            ref_x = ref->width;
  if (ref_y > ref->height)           ref_y = ref->height;
  if (ref_x + block_width < 0)  ref_x = -block_width;
  if (ref_y + block_height < 0) ref_y = -block_height;

  // These are the number of pixels by how far the movement vector points
  // outside the frame. They are always >= 0. If all of them are 0, the
  // movement vector doesn't point outside the frame.
  left   = (ref_x < 0) ? -ref_x : 0;
  top    = (ref_y < 0) ? -ref_y : 0;
  right  = (ref_x + block_width  > ref->width)  ? ref_x + block_width  - ref->width  : 0;
  bottom = (ref_y + block_height > ref->height) ? ref_y + block_height - ref->height : 0;

  // Center picture to the current block and reference to the point where
  // movement vector is pointing to. That point might be outside the buffer,
  // but that is ok because we project the movement vector to the buffer
  // before dereferencing the pointer.
  pic_data = &pic->y[pic_y * pic->stride + pic_x];
  ref_data = &ref->y[ref_y * ref->stride + ref_x];

  // The handling of movement vectors that point outside the picture is done
  // in the following way.
  // - Correct the index of ref_data so that it points to the top-left
  //   of the area we want to compare against.
  // - Correct the index of pic_data to point inside the current block, so
  //   that we compare the right part of the block to the ref_data.
  // - Reduce block_width and block_height so that the the size of the area
  //   being compared is correct.
  //
  // NOTE: No more correct since hor_sad was modified to be a separate
  // strategy
  if (top && left) {
    result += cor_sad(pic_data,
                      &ref_data[top * ref->stride + left],
                      left, top, pic->stride);
    result += uvg_ver_sad(&pic_data[left],
                      &ref_data[top * ref->stride + left],
                      block_width - left, top, pic->stride);

    result += uvg_hor_sad(pic_data + top * pic->stride,
                          ref_data + top * ref->stride,
                          block_width, block_height - top,
                          pic->stride, ref->stride,
                          left, right);

  } else if (top && right) {
    result += uvg_ver_sad(pic_data,
                      &ref_data[top * ref->stride],
                      block_width - right, top, pic->stride);
    result += cor_sad(&pic_data[block_width - right],
                      &ref_data[top * ref->stride + (block_width - right - 1)],
                      right, top, pic->stride);

    result += uvg_hor_sad(pic_data + top * pic->stride,
                          ref_data + top * ref->stride,
                          block_width, block_height - top,
                          pic->stride, ref->stride,
                          left, right);

  } else if (bottom && left) {
    result += uvg_hor_sad(pic_data, ref_data, block_width, block_height - bottom,
                          pic->stride, ref->stride, left, right);

    result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
                      &ref_data[(block_height - bottom - 1) * ref->stride + left],
                      left, bottom, pic->stride);
    result += uvg_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
                      &ref_data[(block_height - bottom - 1) * ref->stride + left],
                      block_width - left, bottom, pic->stride);
  } else if (bottom && right) {
    result += uvg_hor_sad(pic_data, ref_data, block_width, block_height - bottom,
                          pic->stride, ref->stride, left, right);

    result += uvg_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
                      &ref_data[(block_height - bottom - 1) * ref->stride],
                      block_width - right, bottom, pic->stride);
    result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right],
                      &ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1],
                      right, bottom, pic->stride);
  } else if (top) {
    result += uvg_ver_sad(pic_data,
                      &ref_data[top * ref->stride],
                      block_width, top, pic->stride);
    result += reg_sad_maybe_optimized(&pic_data[top * pic->stride],
                      &ref_data[top * ref->stride],
                      block_width, block_height - top, pic->stride, ref->stride,
                      optimized_sad);
  } else if (bottom) {
    result += reg_sad_maybe_optimized(pic_data,
                      ref_data,
                      block_width, block_height - bottom, pic->stride, ref->stride,
                      optimized_sad);
    result += uvg_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
                      &ref_data[(block_height - bottom - 1) * ref->stride],
                      block_width, bottom, pic->stride);
  } else if (left | right) {
    result += uvg_hor_sad(pic_data, ref_data,
                          block_width, block_height, pic->stride,
                          ref->stride, left, right);
  } else {
    result += reg_sad_maybe_optimized(pic_data, ref_data,
                                      block_width, block_height,
                                      pic->stride, ref->stride,
                                      optimized_sad);
  }
  return result;
}

/**
* \brief Calculate interpolated SAD between two blocks.
*
* \param pic        Image for the block we are trying to find.
* \param ref        Image where we are trying to find the block.
*
* \returns          Sum of absolute differences
*/
unsigned uvg_image_calc_sad(const uvg_picture *pic,
                            const uvg_picture *ref,
                            int pic_x,
                            int pic_y,
                            int ref_x,
                            int ref_y,
                            int block_width,
                            int block_height,
                            optimized_sad_func_ptr_t optimized_sad)
{
  assert(pic_x >= 0 && pic_x <= pic->width - block_width);
  assert(pic_y >= 0 && pic_y <= pic->height - block_height);

  uint32_t res;

  if (ref_x >= 0 && ref_x <= ref->width  - block_width &&
      ref_y >= 0 && ref_y <= ref->height - block_height)
  {
    // Reference block is completely inside the frame, so just calculate the
    // SAD directly. This is the most common case, which is why it's first.
    const uvg_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
    const uvg_pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x];

    res = reg_sad_maybe_optimized(pic_data,
                                  ref_data,
                                  block_width,
                                  block_height,
                                  pic->stride,
                                  ref->stride,
                                  optimized_sad);
  } else {
    // Call a routine that knows how to interpolate pixels outside the frame.
    res = image_interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height, optimized_sad);
  }
  return res >> (UVG_BIT_DEPTH - 8);
}


/**
* \brief Calculate interpolated SATD between two blocks.
*
* \param pic        Image for the block we are trying to find.
* \param ref        Image where we are trying to find the block.
*/
unsigned uvg_image_calc_satd(const uvg_picture *pic,
                             const uvg_picture *ref,
                             int pic_x,
                             int pic_y,
                             int ref_x,
                             int ref_y,
                             int block_width,
                             int block_height)
{
  assert(pic_x >= 0 && pic_x <= pic->width - block_width);
  assert(pic_y >= 0 && pic_y <= pic->height - block_height);

  if (ref_x >= 0 && ref_x <= ref->width  - block_width &&
      ref_y >= 0 && ref_y <= ref->height - block_height)
  {
    // Reference block is completely inside the frame, so just calculate the
    // SAD directly. This is the most common case, which is why it's first.
    const uvg_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
    const uvg_pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x];
    return uvg_satd_any_size(block_width,
                             block_height,
                             pic_data,
                             pic->stride,
                             ref_data,
                             ref->stride) >> (UVG_BIT_DEPTH - 8);
  } else {
    // Extrapolate pixels from outside the frame.

    // Space for extrapolated pixels and the part from the picture
    // The extrapolation function will set the pointers and stride.
    uvg_pixel ext_buffer[LCU_LUMA_SIZE];
    uvg_pixel *ext = NULL;
    uvg_pixel *ext_origin = NULL;
    int ext_s = 0;
    uvg_epol_args epol_args = {
      .src = ref->y,
      .src_w = ref->width,
      .src_h = ref->height,
      .src_s = ref->stride,
      .blk_x = ref_x,
      .blk_y = ref_y,
      .blk_w = block_width,
      .blk_h = block_height,
      .pad_l = 0,
      .pad_r = 0,
      .pad_t = 0,
      .pad_b = 0,
      .pad_b_simd = 0,
    };

    // Initialize separately. Gets rid of warning
    // about using nonstandard extension.
    epol_args.buf = ext_buffer;
    epol_args.ext = &ext;
    epol_args.ext_origin = &ext_origin;
    epol_args.ext_s = &ext_s;

    uvg_get_extended_block(&epol_args);

    const uvg_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];

    unsigned satd = uvg_satd_any_size(block_width,
      block_height,
      pic_data,
      pic->stride,
      ext_origin,
      ext_s) >> (UVG_BIT_DEPTH - 8);

    return satd;
  }
}


/**
 * \brief BLock Image Transfer from one buffer to another.
 *
 * It's a stupidly simple loop that copies pixels.
 *
 * \param orig  Start of the originating buffer.
 * \param dst  Start of the destination buffer.
 * \param width  Width of the copied region.
 * \param height  Height of the copied region.
 * \param orig_stride  Width of a row in the originating buffer.
 * \param dst_stride  Width of a row in the destination buffer.
 *
 * This should be inlined, but it's defined here for now to see if Visual
 * Studios LTCG will inline it.
 */
#define BLIT_PIXELS_CASE(n) case n:\
  for (y = 0; y < n; ++y) {\
    memcpy(&dst[y*dst_stride], &orig[y*orig_stride], n * sizeof(uvg_pixel));\
  }\
  break;

void uvg_pixels_blit(const uvg_pixel * const orig, uvg_pixel * const dst,
                         const unsigned width, const unsigned height,
                         const unsigned orig_stride, const unsigned dst_stride)
{
  unsigned y;
  //There is absolutely no reason to have a width greater than the source or the destination stride.
  assert(width <= orig_stride);
  assert(width <= dst_stride);

#ifdef CHECKPOINTS
  char *buffer = malloc((3 * width + 1) * sizeof(char));
  for (y = 0; y < height; ++y) {
    int p;
    for (p = 0; p < width; ++p) {
      sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]);
    }
    buffer[3*width] = 0;
    CHECKPOINT("uvg_pixels_blit_avx2: %04d: %s", y, buffer);
  }
  FREE_POINTER(buffer);
#endif //CHECKPOINTS

  if (width == orig_stride && width == dst_stride) {
    memcpy(dst, orig, width * height * sizeof(uvg_pixel));
    return;
  }

  int nxn_width = (width == height) ? width : 0;
  switch (nxn_width) {
    BLIT_PIXELS_CASE(4)
    BLIT_PIXELS_CASE(8)
    BLIT_PIXELS_CASE(16)
    BLIT_PIXELS_CASE(32)
    BLIT_PIXELS_CASE(64)
  default:

    if (orig == dst) {
      //If we have the same array, then we should have the same stride
      assert(orig_stride == dst_stride);
      return;
    }
    assert(orig != dst || orig_stride == dst_stride);

    for (y = 0; y < height; ++y) {
      memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(uvg_pixel));
    }
    break;
  }
}