#include <cerrno>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <elcore50-matrix-lib/mat_mul_with_dma_fl32.hpp>

// Dimensions of the square test matrices (128 x 128).
const int cols = 128;
const int rows = 128;

// Statically allocated matrix buffers, each holding rows * cols floats.
// src0 and src1 are filled with identical data by main(); dst is the buffer
// handed to the multiplication routine and later summed for the checksum
// (presumably the product matrix -- confirm against the library header).
float src0[rows * cols];
float src1[rows * cols];
float dst[rows * cols];

// Writes the command-line help text of the test program to stdout.
void print_usage() {
  // One entry per output line; none of them contains a format specifier,
  // so they can be written verbatim.
  static const char *const help_lines[] = {
      "Usage: test_matmul [OPTIONS]\n",
      "Options description:\n",
      "\t-i    Number of iterations: unsigned long long. Default: 1\n",
      "\t      For endless run use 0\n",
      "\t-h    Print usage\n",
  };
  const size_t line_count = sizeof(help_lines) / sizeof(help_lines[0]);

  for (size_t idx = 0; idx < line_count; ++idx) {
    fputs(help_lines[idx], stdout);
  }
}

// Parses a non-negative decimal iteration count from `text` into `*out`.
// Returns true on success. Returns false (leaving `*out` untouched) when the
// text is missing, negative, not a number, has trailing characters, or does
// not fit into unsigned long long.
static bool parse_iterations(const char *text, unsigned long long *out) {
  // strtoull() silently negates "-N" into a huge value, so reject any minus
  // sign up front; a '-' anywhere else would be trailing garbage anyway.
  if (text == NULL || strchr(text, '-') != NULL) return false;

  errno = 0;
  char *end = NULL;
  const unsigned long long value = strtoull(text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE) return false;

  *out = value;
  return true;
}

// Sums every element of `dst` and compares the total against `expected`.
// Prints a diagnostic and returns false when the difference exceeds the
// tolerance of 5.0 or when the sum is NaN; returns true otherwise.
static bool result_matches(float expected) {
  const size_t total = static_cast<size_t>(rows) * cols;

  float sum = 0.0f;
  for (size_t i = 0; i < total; ++i) sum += dst[i];

  // Written as a negated "<=" so that a NaN sum is reported as a failure
  // (every ordered comparison with NaN is false).
  if (!(fabs(sum - expected) <= 5.0f)) {
    printf("error %f != %f\n", sum, expected);
    return false;
  }
  return true;
}

// Test driver: repeatedly runs the matrix multiplication on fixed input data
// and validates the checksum of the result.
//
// Command line:
//   -i N   run N iterations (0 = run until a checksum mismatch occurs)
//   -h     print usage and exit
//
// Returns 0 when all performed checks pass (or after -h), 1 on a usage error
// or a checksum mismatch.
int main(int argc, char *argv[]) {
  if (argc > 3) {
    printf("Error: wrong number of parameters: %d!\n", argc);
    print_usage();
    return 1;
  }

  unsigned long long iterations = 1;
  if (argc > 1) {
    const char *key = argv[1];
    if (!strcmp(key, "-h")) {
      print_usage();
      return 0;
    }
    if (strcmp(key, "-i") != 0) {
      printf("Unrecognized program option: %s\n", key);
      print_usage();
      return 1;
    }
    // "-i" without a value would otherwise dereference argv[2] == NULL.
    if (argc < 3) {
      printf("Error: option -i requires a value!\n");
      print_usage();
      return 1;
    }
    if (!parse_iterations(argv[2], &iterations)) {
      printf("Error: wrong number of iterations: %s!\n", argv[2]);
      print_usage();
      return 1;
    }
  }

  // Both inputs hold the same matrix: element (i, j) = |sin(i + j)|.
  for (size_t i = 0; i < rows; ++i) {
    for (size_t j = 0; j < cols; ++j) {
      src0[i * cols + j] = src1[i * cols + j] = fabs(sinf(i + j));
    }
  }

  // Reference value for the sum of all elements of dst after one run.
  const float control_sum = 850749.70397;

  // Checksum is validated every `check_interval` iterations and on the last one.
  const unsigned long long check_interval = 500;

  // Scratch arrays passed to the library routine (presumably cycle and
  // instruction counters -- confirm against the library header).
  int tics[6] = {0};
  int instr[6] = {0};

  unsigned long long it = 0;
  while (true) {
    mm_v0_vliw_1_sub_matrix_pre_load_real_out_offset(src0, rows, cols, src1, cols, dst, 0, 0, 0, cols, tics, instr, 0,
                                                     0);

    // With iterations == 0 this never becomes true, giving the endless run.
    const bool last = (++it == iterations);

    // Always verify the final result so that short runs (including the
    // default single iteration) cannot report success without any check.
    if (last || it % check_interval == 0) {
      if (!result_matches(control_sum)) return 1;
    }

    if (last) break;
  }

  printf("Test passed!\n");

  return 0;
}
