Processing math: 100%

Web Tool

This is a prototype implementation of the sparse tensor algebra compiler theory and contains known bugs, which are documented here. If you find additional issues, please consider submitting a bug report.

Input a tensor algebra expression in index notation to generate code that computes it:
  • SpMV
    SpMV:  y(i) = A(i,j) * x(j)
  • SpGEMM
    SpGEMM:  A(i,j) = B(i,k) * C(k,j)
  • Sparse matrix addition
    Sparse matrix addition:  A(i,j) = B(i,j) + C(i,j)
  • Sparse tensor-times-vector
    Sparse tensor-times-vector:  A(i,j) = B(i,j,k) * c(k)
  • SpMTTKRP
    SpMTTKRP:  A(i,j) = B(i,k,l) * D(l,j) * C(k,j)
Tensor
Format
(Level Formats)
y
A
x
Scheduling Command
Arguments
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s="split(i,i0,i1,32)" -s="reorder(i0,i1,j)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c

int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
  int y1_dimension = (int)(y->dimensions[0]);
  double* restrict y_vals = (double*)(y->vals);
  int A1_dimension = (int)(A->dimensions[0]);
  int* restrict A2_pos = (int*)(A->indices[1][0]);
  int* restrict A2_crd = (int*)(A->indices[1][1]);
  double* restrict A_vals = (double*)(A->vals);
  int x1_dimension = (int)(x->dimensions[0]);
  double* restrict x_vals = (double*)(x->vals);

  #pragma omp parallel for schedule(runtime)
  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
    for (int32_t i1 = 0; i1 < 32; i1++) {
      int32_t i = i0 * 32 + i1;
      if (i >= A1_dimension)
        continue;

      double tjy_val = 0.0;
      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
        int32_t j = A2_crd[jA];
        tjy_val += A_vals[jA] * x_vals[j];
      }
      y_vals[i] = tjy_val;
    }
  }
  return 0;
}
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s="split(i,i0,i1,32)" -s="reorder(i0,i1,j)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c

int assemble(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
  int y1_dimension = (int)(y->dimensions[0]);
  double* restrict y_vals = (double*)(y->vals);

  y_vals = (double*)malloc(sizeof(double) * y1_dimension);

  y->vals = (uint8_t*)y_vals;
  return 0;
}
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s="split(i,i0,i1,32)" -s="reorder(i0,i1,j)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
#ifndef TACO_C_HEADERS
#define TACO_C_HEADERS
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <complex.h>
#include <string.h>
#if _OPENMP
#include <omp.h>
#endif
#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
#ifndef TACO_TENSOR_T_DEFINED
#define TACO_TENSOR_T_DEFINED
typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;
typedef struct {
  int32_t      order;         // tensor order (number of modes)
  int32_t*     dimensions;    // tensor dimensions
  int32_t      csize;         // component size
  int32_t*     mode_ordering; // mode storage ordering
  taco_mode_t* mode_types;    // mode storage types
  uint8_t***   indices;       // tensor index data (per mode)
  uint8_t*     vals;          // tensor values
  uint8_t*     fill_value;    // tensor fill value
  int32_t      vals_size;     // values array size
} taco_tensor_t;
#endif
#if !_OPENMP
int omp_get_thread_num() { return 0; }
int omp_get_max_threads() { return 1; }
#endif
int cmp(const void *a, const void *b) {
  return *((const int*)a) - *((const int*)b);
}
int taco_gallop(int *array, int arrayStart, int arrayEnd, int target) {
  if (array[arrayStart] >= target || arrayStart >= arrayEnd) {
    return arrayStart;
  }
  int step = 1;
  int curr = arrayStart;
  while (curr + step < arrayEnd && array[curr + step] < target) {
    curr += step;
    step = step * 2;
  }

  step = step / 2;
  while (step > 0) {
    if (curr + step < arrayEnd && array[curr + step] < target) {
      curr += step;
    }
    step = step / 2;
  }
  return curr+1;
}
int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
  if (array[arrayStart] >= target) {
    return arrayStart;
  }
  int lowerBound = arrayStart; // always < target
  int upperBound = arrayEnd; // always >= target
  while (upperBound - lowerBound > 1) {
    int mid = (upperBound + lowerBound) / 2;
    int midValue = array[mid];
    if (midValue < target) {
      lowerBound = mid;
    }
    else if (midValue > target) {
      upperBound = mid;
    }
    else {
      return mid;
    }
  }
  return upperBound;
}
int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
  if (array[arrayEnd] <= target) {
    return arrayEnd;
  }
  int lowerBound = arrayStart; // always <= target
  int upperBound = arrayEnd; // always > target
  while (upperBound - lowerBound > 1) {
    int mid = (upperBound + lowerBound) / 2;
    int midValue = array[mid];
    if (midValue < target) {
      lowerBound = mid;
    }
    else if (midValue > target) {
      upperBound = mid;
    }
    else {
      return mid;
    }
  }
  return lowerBound;
}
taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
                                  int32_t* dimensions, int32_t* mode_ordering,
                                  taco_mode_t* mode_types) {
  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
  t->order         = order;
  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));
  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));
  t->csize         = csize;
  for (int32_t i = 0; i < order; i++) {
    t->dimensions[i]    = dimensions[i];
    t->mode_ordering[i] = mode_ordering[i];
    t->mode_types[i]    = mode_types[i];
    switch (t->mode_types[i]) {
      case taco_mode_dense:
        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
        break;
      case taco_mode_sparse:
        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
        break;
    }
  }
  return t;
}
void deinit_taco_tensor_t(taco_tensor_t* t) {
  for (int i = 0; i < t->order; i++) {
    free(t->indices[i]);
  }
  free(t->indices);
  free(t->dimensions);
  free(t->mode_ordering);
  free(t->mode_types);
  free(t);
}
#endif

int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
  int y1_dimension = (int)(y->dimensions[0]);
  double* restrict y_vals = (double*)(y->vals);
  int A1_dimension = (int)(A->dimensions[0]);
  int* restrict A2_pos = (int*)(A->indices[1][0]);
  int* restrict A2_crd = (int*)(A->indices[1][1]);
  double* restrict A_vals = (double*)(A->vals);
  int x1_dimension = (int)(x->dimensions[0]);
  double* restrict x_vals = (double*)(x->vals);

  #pragma omp parallel for schedule(runtime)
  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
    for (int32_t i1 = 0; i1 < 32; i1++) {
      int32_t i = i0 * 32 + i1;
      if (i >= A1_dimension)
        continue;

      double tjy_val = 0.0;
      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
        int32_t j = A2_crd[jA];
        tjy_val += A_vals[jA] * x_vals[j];
      }
      y_vals[i] = tjy_val;
    }
  }
  return 0;
}

int assemble(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
  int y1_dimension = (int)(y->dimensions[0]);
  double* restrict y_vals = (double*)(y->vals);

  y_vals = (double*)malloc(sizeof(double) * y1_dimension);

  y->vals = (uint8_t*)y_vals;
  return 0;
}

int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
  int y1_dimension = (int)(y->dimensions[0]);
  double* restrict y_vals = (double*)(y->vals);
  int A1_dimension = (int)(A->dimensions[0]);
  int* restrict A2_pos = (int*)(A->indices[1][0]);
  int* restrict A2_crd = (int*)(A->indices[1][1]);
  double* restrict A_vals = (double*)(A->vals);
  int x1_dimension = (int)(x->dimensions[0]);
  double* restrict x_vals = (double*)(x->vals);

  int32_t y_capacity = y1_dimension;
  y_vals = (double*)malloc(sizeof(double) * y_capacity);

  #pragma omp parallel for schedule(runtime)
  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
    for (int32_t i1 = 0; i1 < 32; i1++) {
      int32_t i = i0 * 32 + i1;
      if (i >= A1_dimension)
        continue;

      double tjy_val = 0.0;
      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
        int32_t j = A2_crd[jA];
        tjy_val += A_vals[jA] * x_vals[j];
      }
      y_vals[i] = tjy_val;
    }
  }

  y->vals = (uint8_t*)y_vals;
  return 0;
}

/*
 * The `pack` functions convert coordinate and value arrays in COO format,
 * with nonzeros sorted lexicographically by their coordinates, to the
 * specified input format.
 *
 * The `unpack` function converts the specified output format to coordinate
 * and value arrays in COO format.
 *
 * For both, the `_COO_pos` arrays contain two elements, where the first is 0
 * and the second is the number of nonzeros in the tensor.
 */

int pack_A(taco_tensor_t *A, int* A_COO1_pos, int* A_COO1_crd, int* A_COO2_crd, double* A_COO_vals) {
  int A1_dimension = (int)(A->dimensions[0]);
  int* restrict A2_pos = (int*)(A->indices[1][0]);
  int* restrict A2_crd = (int*)(A->indices[1][1]);
  double* restrict A_vals = (double*)(A->vals);

  A2_pos = (int32_t*)malloc(sizeof(int32_t) * (A1_dimension + 1));
  A2_pos[0] = 0;
  for (int32_t pA2 = 1; pA2 < (A1_dimension + 1); pA2++) {
    A2_pos[pA2] = 0;
  }
  int32_t A2_crd_size = 1048576;
  A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_crd_size);
  int32_t jA = 0;
  int32_t A_capacity = 1048576;
  A_vals = (double*)malloc(sizeof(double) * A_capacity);

  int32_t iA_COO = A_COO1_pos[0];
  int32_t pA_COO1_end = A_COO1_pos[1];

  while (iA_COO < pA_COO1_end) {
    int32_t i = A_COO1_crd[iA_COO];
    int32_t A_COO1_segend = iA_COO + 1;
    while (A_COO1_segend < pA_COO1_end && A_COO1_crd[A_COO1_segend] == i) {
      A_COO1_segend++;
    }
    int32_t pA2_begin = jA;

    int32_t jA_COO = iA_COO;

    while (jA_COO < A_COO1_segend) {
      int32_t j = A_COO2_crd[jA_COO];
      double A_COO_val = A_COO_vals[jA_COO];
      jA_COO++;
      while (jA_COO < A_COO1_segend && A_COO2_crd[jA_COO] == j) {
        A_COO_val += A_COO_vals[jA_COO];
        jA_COO++;
      }
      if (A_capacity <= jA) {
        A_vals = (double*)realloc(A_vals, sizeof(double) * (A_capacity * 2));
        A_capacity *= 2;
      }
      A_vals[jA] = A_COO_val;
      if (A2_crd_size <= jA) {
        A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2));
        A2_crd_size *= 2;
      }
      A2_crd[jA] = j;
      jA++;
    }

    A2_pos[i + 1] = jA - pA2_begin;
    iA_COO = A_COO1_segend;
  }

  int32_t csA2 = 0;
  for (int32_t pA20 = 1; pA20 < (A1_dimension + 1); pA20++) {
    csA2 += A2_pos[pA20];
    A2_pos[pA20] = csA2;
  }

  A->indices[1][0] = (uint8_t*)(A2_pos);
  A->indices[1][1] = (uint8_t*)(A2_crd);
  A->vals = (uint8_t*)A_vals;
  return 0;
}

int pack_x(taco_tensor_t *x, int* x_COO1_pos, int* x_COO1_crd, double* x_COO_vals) {
  int x1_dimension = (int)(x->dimensions[0]);
  double* restrict x_vals = (double*)(x->vals);

  int32_t x_capacity = x1_dimension;
  x_vals = (double*)malloc(sizeof(double) * x_capacity);

  #pragma omp parallel for schedule(static)
  for (int32_t px = 0; px < x_capacity; px++) {
    x_vals[px] = 0.0;
  }

  int32_t jx_COO = x_COO1_pos[0];
  int32_t px_COO1_end = x_COO1_pos[1];

  while (jx_COO < px_COO1_end) {
    int32_t j = x_COO1_crd[jx_COO];
    double x_COO_val = x_COO_vals[jx_COO];
    jx_COO++;
    while (jx_COO < px_COO1_end && x_COO1_crd[jx_COO] == j) {
      x_COO_val += x_COO_vals[jx_COO];
      jx_COO++;
    }
    x_vals[j] = x_COO_val;
  }

  x->vals = (uint8_t*)x_vals;
  return 0;
}

int unpack(int** y_COO1_pos_ptr, int** y_COO1_crd_ptr, double** y_COO_vals_ptr, taco_tensor_t *y) {
  int* y_COO1_pos;
  int* y_COO1_crd;
  double* y_COO_vals;
  int y1_dimension = (int)(y->dimensions[0]);
  double* restrict y_vals = (double*)(y->vals);

  y_COO1_pos = (int32_t*)malloc(sizeof(int32_t) * 2);
  y_COO1_pos[0] = 0;
  int32_t y_COO1_crd_size = 1048576;
  y_COO1_crd = (int32_t*)malloc(sizeof(int32_t) * y_COO1_crd_size);
  int32_t iy_COO = 0;
  int32_t y_COO_capacity = 1048576;
  y_COO_vals = (double*)malloc(sizeof(double) * y_COO_capacity);


  for (int32_t i = 0; i < y1_dimension; i++) {
    if (y_COO_capacity <= iy_COO) {
      y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * (y_COO_capacity * 2));
      y_COO_capacity *= 2;
    }
    y_COO_vals[iy_COO] = y_vals[i];
    if (y_COO1_crd_size <= iy_COO) {
      y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * (y_COO1_crd_size * 2));
      y_COO1_crd_size *= 2;
    }
    y_COO1_crd[iy_COO] = i;
    iy_COO++;
  }

  y_COO1_pos[1] = iy_COO;

  *y_COO1_pos_ptr = y_COO1_pos;
  *y_COO1_crd_ptr = y_COO1_crd;
  *y_COO_vals_ptr = y_COO_vals;
  return 0;
}

The generated code is provided "as is" without warranty of any kind. To help us improve TACO, we keep an anonymized record of all requests submitted to the TACO online server.