#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_LIBFFTW3
#include <fftw3.h>
#endif

#include <vector>
#include <mutex>

#include "cdo_output.h"
#include "cdo_options.h"
#include "cdo_fctrans.h"
#include "cimdOmp.h"

static std::mutex fftw_mutex;

void
fc2gp(double *fc, double *gp, long nlat, long nlon, long nfc)
{
#ifdef HAVE_LIBFFTW3
  struct FourierMemory
  {
    fftw_complex *in_fft;
    double *out_fft;
    fftw_plan plan;
  };

  std::vector<FourierMemory> ompmem(Threading::ompNumThreads);

  for (int i = 0; i < Threading::ompNumThreads; i++)
    {
      ompmem[i].in_fft = fftw_alloc_complex(nlon / 2 + 1);
      ompmem[i].out_fft = (double *) fftw_malloc(nlon * sizeof(double));
      std::unique_lock<std::mutex> locked_mutex(fftw_mutex);
      ompmem[i].plan = fftw_plan_dft_c2r_1d(nlon, ompmem[i].in_fft, ompmem[i].out_fft, FFTW_ESTIMATE);
    }

  if (Options::cdoVerbose) fftw_print_plan(ompmem[0].plan);

#ifdef _OPENMP
#pragma omp parallel for default(shared)
#endif
  for (long ilat = 0; ilat < nlat; ++ilat)
    {
      const auto ompthID = cdo_omp_get_thread_num();
      auto in_fft = ompmem[ompthID].in_fft;
      auto out_fft = ompmem[ompthID].out_fft;

      for (long ifc = 0; ifc < nfc / 2; ++ifc)
        {
          in_fft[ifc][0] = fc[2 * ifc * nlat + ilat];
          in_fft[ifc][1] = fc[(2 * ifc + 1) * nlat + ilat];
        }
      for (long ifc = nfc / 2; ifc < (nlon / 2 + 1); ++ifc)
        {
          in_fft[ifc][0] = 0;
          in_fft[ifc][1] = 0;
        }

      fftw_execute(ompmem[ompthID].plan);

      for (long ilon = 0; ilon < nlon; ++ilon) gp[ilat * nlon + ilon] = out_fft[ilon];
    }

  for (int i = 0; i < Threading::ompNumThreads; i++)
    {
      fftw_free(ompmem[i].in_fft);
      fftw_free(ompmem[i].out_fft);
      fftw_destroy_plan(ompmem[i].plan);
    }
#else
  cdoAbort("FFTW support not compiled in!");
#endif
}

void
gp2fc(const double *gp, double *fc, long nlat, long nlon, long nfc)
{
#ifdef HAVE_LIBFFTW3
  double norm = 1. / nlon;
  struct FourierMemory
  {
    double *in_fft;
    fftw_complex *out_fft;
    fftw_plan plan;
  };

  std::vector<FourierMemory> ompmem(Threading::ompNumThreads);

  for (int i = 0; i < Threading::ompNumThreads; i++)
    {
      ompmem[i].in_fft = (double *) fftw_malloc(nlon * sizeof(double));
      ompmem[i].out_fft = fftw_alloc_complex(nlon / 2 + 1);
      std::unique_lock<std::mutex> locked_mutex(fftw_mutex);
      ompmem[i].plan = fftw_plan_dft_r2c_1d(nlon, ompmem[i].in_fft, ompmem[i].out_fft, FFTW_ESTIMATE);
    }

  if (Options::cdoVerbose) fftw_print_plan(ompmem[0].plan);

#ifdef _OPENMP
#pragma omp parallel for default(shared)
#endif
  for (long ilat = 0; ilat < nlat; ++ilat)
    {
      const auto ompthID = cdo_omp_get_thread_num();
      auto in_fft = ompmem[ompthID].in_fft;
      auto out_fft = ompmem[ompthID].out_fft;

      for (long ilon = 0; ilon < nlon; ++ilon) in_fft[ilon] = gp[ilat * nlon + ilon];

      fftw_execute(ompmem[ompthID].plan);

      for (long ifc = 0; ifc < nfc / 2; ++ifc)
        {
          fc[2 * ifc * nlat + ilat] = norm * out_fft[ifc][0];
          fc[(2 * ifc + 1) * nlat + ilat] = norm * out_fft[ifc][1];
        }
    }

  for (int i = 0; i < Threading::ompNumThreads; i++)
    {
      fftw_free(ompmem[i].in_fft);
      fftw_free(ompmem[i].out_fft);
      fftw_destroy_plan(ompmem[i].plan);
    }
#else
  cdoAbort("FFTW support not compiled in!");
#endif
}
