/*
 * Performs voxel binning for very large volumes, applying one of the functions defined herein.
 * This file is part of ufo-serge filter set.
 * Copyright (C) 2018 Serge Cohen
 *
 * This library is free software: you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation, either
 * version 3 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Serge Cohen <serge.cohen@synchrotron-soleil.fr>
 */

/*
 * We are defining the following kernels :
 * min : each box is evaluated to the minimal value found in the box
 * max : each box is evaluated to the maximal value found in the box
 * average : each box is evaluated to the average of the box's values
 * range : each box is evaluated to the range of the box's values
 * var : each box is evaluated to the variance of the box's values
 *
 * Later on, we may propose extra functions/kernels :
 * grad_x : the x component of the gradient in the box
 * grad_y : the y component of the gradient in the box
 * grad_z : the z component of the gradient in the box
 * abs_grad : the norm of the gradient in the box
 * div_grad : the divergence of the gradient in the box
 *
 * Maybe later it might be useful to extend to the possibility to get
 * multiple of these values (like the 3 coordinates of the gradient
 * at once, rather than having to compute each one separately).
 */

// The following constant must be defined at compile time (as a build option) :
// EDGE : the edge length of the binning box, in pixel/voxel count

/*
 * This OpenCL source file contains kernels that are used in the voxel
 * binning task of UFO. For each input frame the `input kernel` is called.
 * Every EDGE frames, the `output kernel` is called to generate the
 * resulting (binned) frame. It is important that these all have the same
 * signature :
 *
 * `input kernel` : One work-item per resulting/output voxel
 * float *iFrame, : the current input frame to be processed
 * float *ioBuf1,
 * float *ioBuf2  : both same dimension as output frame, hold intermediate results
 * uint iIndex    : the index of the frame within the binning box (from 0 to EDGE-1)
 * uint iXSize    : the X-size / width of iFrame
 * uint iYSize    : the Y-size / height of iFrame
 *
 * `output kernel` :
 * float *oImage,  : the output image, generated by the kernel
 * float *iBuf1,
 * float *iBuf2,   : the object ioBuf1, ioBuf2 of the `input kernel`
 * uint iNumFrame  : the number of frames used to compute this reduction (EDGE, except for the last call, might be incomplete).
 * uint iXSize     : the X-size / width of iFrame
 * uint iYSize     : the Y-size / height of iFrame
 *
 * NB : these last two are required in the reduce kernel so that the kernel can compute precisely how many voxels were processed
 * within the resulting bounding box.
 */


/************************************/
/**   Min, max and range binning   **/

/*
 * Input kernel for the min / max / range binning.
 *
 * Each work-item handles one output voxel: it scans the EDGE x EDGE box of
 * iFrame that maps onto that voxel (clipped to the frame size) and folds the
 * values into the running minimum and maximum kept in ioMin / ioMax.
 *
 * iFrame : current input frame, row-major, iXSize values per row
 * ioMin  : running minimum per output voxel (read when iIndex != 0, always written)
 * ioMax  : running maximum per output voxel (read when iIndex != 0, always written)
 * iIndex : index of the frame within the box; 0 (re)starts the accumulation
 * iXSize : width of iFrame
 * iYSize : height of iFrame
 *
 * The output row stride is taken to be get_global_size(0).
 */
kernel void
range_k(
        global float *iFrame,
        global float *ioMin,
        global float *ioMax,
        const uint iIndex,
        const uint iXSize,
        const uint iYSize
        )
{
  const size_t oStride = get_global_size(0);
  const size_t globalX = get_global_id(0);
  const size_t globalY = get_global_id(1);
  const size_t oIndex = globalY*oStride + globalX;

  // Getting the base position of pixels, and the extent...
  const size_t startX = globalX * EDGE;
  const size_t startY = globalY * EDGE;

  // A work-item whose box starts outside the frame has no input voxel at all :
  // bail out rather than reading iFrame out of bounds.
  if ( startX >= iXSize || startY >= iYSize ) {
    return;
  }

  const size_t endX = min((size_t)(iXSize), startX+EDGE);
  const size_t endY = min((size_t)(iYSize), startY+EDGE);

  // Initialisation of the min/max looping :
  // first frame of a box starts from the first voxel, later frames resume
  // from the values stored by the previous call.
  float min_v, max_v;
  if ( 0 == iIndex ) {
    min_v = max_v = iFrame[startX + iXSize*startY];
  }
  else {
    min_v = ioMin[oIndex];
    max_v = ioMax[oIndex];
  }

  // For loop, going through all the voxels of the box :
  size_t ind_line = startY * iXSize;
  for ( size_t iiY=startY; iiY < endY; ++iiY ) {
    size_t ind_col = ind_line + startX;
    for ( size_t iiX=startX; iiX < endX; ++iiX ) {
      const float v = iFrame[ind_col];
      min_v = fmin(min_v, v);
      max_v = fmax(max_v, v);
      ++ind_col;
    }
    // Moving to the next row : the row stride of iFrame is its width.
    ind_line += iXSize;
  }

  // Saving results for future reference :
  ioMin[oIndex] = min_v;
  ioMax[oIndex] = max_v;
}

/*
 * Output kernel for the `min` binning : each work-item copies the
 * accumulated minimum of its output voxel into oImage.
 *
 * oImage    : resulting (binned) image
 * iMin      : accumulated minimum per output voxel
 * iMax      : accumulated maximum per output voxel (not used here)
 * iNumFrame, iXSize, iYSize : not used here, present so that all output
 *                             kernels share the same signature
 */
kernel void
min_red_k(
          global float *oImage,
          global float *iMin,
          global float *iMax,
          const uint iNumFrame,
          const uint iXSize,
          const uint iYSize
          )
{
  // Linear position of this work-item's voxel; row stride is the global size along X.
  const size_t col = get_global_id(0);
  const size_t row = get_global_id(1);
  const size_t voxel = row * get_global_size(0) + col;

  oImage[voxel] = iMin[voxel];
}

/*
 * Output kernel for the `max` binning : each work-item copies the
 * accumulated maximum of its output voxel into oImage.
 *
 * oImage    : resulting (binned) image
 * iMin      : accumulated minimum per output voxel (not used here)
 * iMax      : accumulated maximum per output voxel
 * iNumFrame, iXSize, iYSize : not used here, present so that all output
 *                             kernels share the same signature
 */
kernel void
max_red_k(
          global float *oImage,
          global float *iMin,
          global float *iMax,
          const uint iNumFrame,
          const uint iXSize,
          const uint iYSize
          )
{
  // Linear position of this work-item's voxel; row stride is the global size along X.
  const size_t col = get_global_id(0);
  const size_t row = get_global_id(1);
  const size_t voxel = row * get_global_size(0) + col;

  oImage[voxel] = iMax[voxel];
}

/*
 * Output kernel for the `range` binning : each work-item writes
 * (maximum - minimum) of its output voxel into oImage.
 *
 * oImage    : resulting (binned) image
 * iMin      : accumulated minimum per output voxel
 * iMax      : accumulated maximum per output voxel
 * iNumFrame, iXSize, iYSize : not used here, present so that all output
 *                             kernels share the same signature
 */
kernel void
range_red_k(
            global float *oImage,
            global float *iMin,
            global float *iMax,
            const uint iNumFrame,
            const uint iXSize,
            const uint iYSize
            )
{
  // Linear position of this work-item's voxel; row stride is the global size along X.
  const size_t col = get_global_id(0);
  const size_t row = get_global_id(1);
  const size_t voxel = row * get_global_size(0) + col;

  const float highest = iMax[voxel];
  const float lowest = iMin[voxel];

  oImage[voxel] = highest - lowest;
}

/**   Min, max and range binning   **/
/************************************/


/**************************************/
/**   Average and variance binning   **/

/*
 * Input kernel for the average / variance binning.
 *
 * Each work-item handles one output voxel: it scans the EDGE x EDGE box of
 * iFrame that maps onto that voxel (clipped to the frame size) and adds the
 * values, and their squares, to the running sums kept in ioSum / ioSumSq.
 *
 * iFrame  : current input frame, row-major, iXSize values per row
 * ioSum   : running sum per output voxel (read when iIndex != 0, always written)
 * ioSumSq : running sum of squares per output voxel (read when iIndex != 0, always written)
 * iIndex  : index of the frame within the box; 0 (re)starts the accumulation
 * iXSize  : width of iFrame
 * iYSize  : height of iFrame
 *
 * The output row stride is taken to be get_global_size(0).
 */
kernel void
ave_var_k(
          global float *iFrame,
          global float *ioSum,
          global float *ioSumSq,
          const uint iIndex,
          const uint iXSize,
          const uint iYSize
          )
{
  const size_t oStride = get_global_size(0);
  const size_t globalX = get_global_id(0);
  const size_t globalY = get_global_id(1);
  const size_t oIndex = globalY*oStride + globalX;

  // Getting the base position of pixels, and the extent...
  const size_t startX = globalX * EDGE;
  const size_t startY = globalY * EDGE;
  const size_t endX = min((size_t)(iXSize), startX+EDGE);
  const size_t endY = min((size_t)(iYSize), startY+EDGE);

  // Initialisation of the sum and sum of sq looping :
  // the first frame of a box starts from zero. It must NOT be seeded with the
  // first voxel, since the loop below visits that voxel as well and it would
  // then be counted twice.
  float sum_v = 0.0f;
  float sum_sq_v = 0.0f;
  if ( 0 != iIndex ) {
    sum_v = ioSum[oIndex];
    sum_sq_v = ioSumSq[oIndex];
  }

  // For loop, going through all the voxels of the box.
  // `<` bounds make a box starting outside the frame an empty loop.
  size_t ind_line = startY * iXSize;
  for ( size_t iiY=startY; iiY < endY; ++iiY ) {
    size_t ind_col = ind_line + startX;
    for ( size_t iiX=startX; iiX < endX; ++iiX ) {
      const float v = iFrame[ind_col];
      sum_v += v;
      sum_sq_v = fma(v, v, sum_sq_v);
      ++ind_col;
    }
    // Moving to the next row : the row stride of iFrame is its width.
    ind_line += iXSize;
  }

  // Saving results for future reference :
  ioSum[oIndex] = sum_v;
  ioSumSq[oIndex] = sum_sq_v;
}

/*
 * Output kernel for the `average` binning : each work-item divides the
 * accumulated sum of its output voxel by the number of input voxels that
 * fall inside its box.
 *
 * oImage    : resulting (binned) image
 * iSum      : accumulated sum per output voxel
 * iSumSq    : accumulated sum of squares per output voxel (not used here)
 * iNumFrame : number of frames accumulated in this box
 * iXSize    : width of the input frames
 * iYSize    : height of the input frames
 */
kernel void
ave_red_k(
          global float *oImage,
          global float *iSum,
          global float *iSumSq,
          const uint iNumFrame,
          const uint iXSize,
          const uint iYSize
          )
{
  const size_t col = get_global_id(0);
  const size_t row = get_global_id(1);
  const size_t voxel = row * get_global_size(0) + col;

  // In-frame footprint of the box : boxes on the right / bottom border may be
  // clipped by the frame size.
  const size_t firstX = col * EDGE;
  const size_t firstY = row * EDGE;
  const size_t boxWidth = min((size_t)(iXSize), firstX+EDGE) - firstX;
  const size_t boxHeight = min((size_t)(iYSize), firstY+EDGE) - firstY;

  // Total number of voxels that were summed for this output voxel.
  const float numVoxels = convert_float(boxWidth * boxHeight * iNumFrame);

  oImage[voxel] = iSum[voxel] / numVoxels;
}

/*
 * Output kernel for the `var` binning : each work-item computes the
 * variance of its box as mean(x^2) - mean(x)^2, from the accumulated sum
 * and sum of squares.
 *
 * oImage    : resulting (binned) image
 * iSum      : accumulated sum per output voxel
 * iSumSq    : accumulated sum of squares per output voxel
 * iNumFrame : number of frames accumulated in this box
 * iXSize    : width of the input frames
 * iYSize    : height of the input frames
 */
kernel void
var_red_k(
          global float *oImage,
          global float *iSum,
          global float *iSumSq,
          const uint iNumFrame,
          const uint iXSize,
          const uint iYSize
          )
{
  const size_t col = get_global_id(0);
  const size_t row = get_global_id(1);
  const size_t voxel = row * get_global_size(0) + col;

  // In-frame footprint of the box : boxes on the right / bottom border may be
  // clipped by the frame size.
  const size_t firstX = col * EDGE;
  const size_t firstY = row * EDGE;
  const size_t boxWidth = min((size_t)(iXSize), firstX+EDGE) - firstX;
  const size_t boxHeight = min((size_t)(iYSize), firstY+EDGE) - firstY;

  // Total number of voxels that were summed for this output voxel.
  const float numVoxels = convert_float(boxWidth * boxHeight * iNumFrame);

  // Mean of the values and mean of the squared values :
  const float mean = iSum[voxel] / numVoxels;
  const float meanSq = iSumSq[voxel] / numVoxels;

  // variance = meanSq - mean*mean, evaluated with a single fused multiply-add.
  oImage[voxel] = fma(-mean, mean, meanSq);
}

/**   Average and variance binning   **/
/**************************************/
