Math Library

qMath Functions

Apart from the standard arithmetic operators such as +, -, *, / and %, Quadric C++ also supports special functions in the qMath namespace, such as:

constexpr std::int32_t log2(std::int32_t n)

Computes the base-2 logarithm of an integer at compile time.

Parameters
  • [in] n: The input value.
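
A minimal compile-time usage sketch (illustrative; this assumes the function is reachable through the qmath namespace used by the examples below):

  // log2 is constexpr, so it can drive compile-time checks and constants.
  static_assert(qmath::log2(1024) == 10, "log2(1024) should be 10");
  constexpr std::int32_t numStages = qmath::log2(8);  // e.g. 3 butterfly stages for an 8-point FFT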

template<FracRepType numFracBits, typename T>
INLINE qVar_t<FixedPoint<T, numFracBits>> log(const qVar_t<FixedPoint<T, numFracBits>> &inp)

Computes the natural logarithm of the input qVar. The expected input type is signed 32-bit fixed point with 16 integer bits and 16 fractional bits; the output is produced in the same format.

https://www.quinapalus.com/efunc.html

Template Parameters
  • T: Underlying type of qVar (deducible)

  • numFracBits: Number of bits used for fractional part.

Parameters
  • [in] inp: The input qVar. Input type is 32-bit fixed point.

    Error Table:

    Number of Fractional Bits | Percent of computations w/ rel. error > 1e-4 | Maximum Absolute Error
    --------------------------|----------------------------------------------|-----------------------
               5              |                    95.8                      |        1.40e-1
               6              |                    97.4                      |        9.18e-2
               7              |                    95.4                      |        5.25e-3
               8              |                    88.3                      |        2.49e-2
               9              |                    63.4                      |        1.04e-2
              10              |                    35.1                      |        4.78e-3
              11              |                    2.84                      |        2.10e-3
              12              |                    0.018                     |        9.13e-4
              13              |                    0.00200                   |        4.02e-4
              14              |                    0.000754                  |        1.79e-4
              15              |                    0.000920                  |        8.92e-5
              16              |                    0.00204                   |        7.77e-5
              17              |                    0.00214                   |        5.02e-5
              18              |                    0.00261                   |        4.08e-5
              19              |                    0.00596                   |        3.67e-5
              20              |                    0.0101                    |        3.29e-5
              21              |                    0.0202                    |        3.19e-5
              22              |                    0.0389                    |        3.09e-5
              23              |                    0.0763                    |        3.09e-5
              24              |                    0.152                     |        3.05e-5
              25              |                    0.302                     |        3.05e-5
              26              |                    0.604                     |        3.05e-5
              27              |                    1.21                      |        3.05e-5

template<FracRepType numFracBits, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> exp(const qVar_t<T> &inp)

Computes the natural exponent (e^x) of the input variable.

Template Parameters
  • numFracBits: Number of bits used for fractional part

  • T: Underlying type of qVar (deductible)

Parameters
  • [in] inp: The input value. Input type is 32-bit fixed point.

template<FracRepType numFracBits, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> vectorMag(qVar_t<T> &x, qVar_t<T> &y)

Computes the magnitude of the input qVars. The output range varies with the inputs, so it is recommended to use at least one more integer bit than is needed to represent the largest input.

Template Parameters
  • T: Underlying type of qVar (deducible)

  • numFracBits: Number of bits used for fractional part

Parameters
  • [in] x: The input in the x dimension (gets flipped if negative). Input type is 32-bit fixed point.

  • [in] y: The input in the y dimension. Input type is 32-bit fixed point.
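
A minimal usage sketch, mirroring the sin/cos examples further below (the variable names and fractional-bit count here are illustrative):

  constexpr FracRepType NUM_F_BITS = 13;  // extra integer headroom, per the note above
  qVar_t<FixedPoint32<NUM_F_BITS>> qX = 3.0;
  qVar_t<FixedPoint32<NUM_F_BITS>> qY = 4.0;
  // Expected result: sqrt(3*3 + 4*4) = 5
  qVar_t<FixedPoint32<NUM_F_BITS>> qMag = qmath::vectorMag(qX, qY);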

template<FracRepType numFracBits, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> sin(const qVar_t<T> &inp)

Computes the sine of the input qVar. The input should be in the range [-pi/2, pi/2] and use at least 2 integer bits. If an angle is not in this range, it is recommended to reduce it with the following trig identities (see the sketch below):

  a. if abs(theta) > 2*pi -> theta = theta % (2*pi)
  b. if theta > pi        -> theta = theta - (2*pi)
  c. if theta < -pi       -> theta = theta + (2*pi)
  d. if theta > pi/2      -> sin(theta) = sin(pi - theta)
  e. if theta < -pi/2     -> sin(theta) = sin(-pi - theta)
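
A plain-C++ sketch of that range reduction (illustrative only; written with float for clarity, though the same steps apply to the fixed-point qVar values):

  #include <cmath>

  // Reduce an arbitrary angle into [-pi/2, pi/2] before calling qmath::sin,
  // following identities (a) through (e) above.
  float reduceForSin(float theta) {
    const float PI = 3.14159265f;
    if (std::fabs(theta) > 2 * PI) theta = std::fmod(theta, 2 * PI);  // (a)
    if (theta > PI)  theta -= 2 * PI;                                 // (b)
    if (theta < -PI) theta += 2 * PI;                                 // (c)
    if (theta > PI / 2)  theta = PI - theta;   // (d) sin(theta) == sin(pi - theta)
    if (theta < -PI / 2) theta = -PI - theta;  // (e) sin(theta) == sin(-pi - theta)
    return theta;
  }

The analogous identities for cos are listed under cos below.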

Template Parameters
  • T: Underlying type of qVar (deducible)

  • numFracBits: Number of bits used for fractional part

Parameters
  • [in] inp: The input theta in radians (between -pi/2 and pi/2). Input type is 32-bit fixed point. Error Table:

    Number of Fractional Bits | Percent of computations w/ rel. error > 1e-4 | Maximum Absolute Error
    --------------------------|----------------------------------------------|-----------------------
               2              |                    84.6                      |        2.47e-1
               3              |                    92.0                      |        1.25e-1
               4              |                    96.1                      |        6.25e-2
               5              |                    98.0                      |        3.12e-2
               6              |                    99.0                      |        1.56e-2
               7              |                    98.5                      |        7.81e-3
               8              |                    97.0                      |        3.91e-3
               9              |                    95.5                      |        1.95e-3
              10              |                    93.3                      |        9.77e-4
              11              |                    87.9                      |        4.88e-4
              12              |                    74.3                      |        2.44e-4
              13              |                    48.0                      |        1.22e-4
              14              |                    20.3                      |        6.11e-5
              15              |                    10.0                      |        3.05e-5
              16              |                    5.00                      |        1.53e-5
              17              |                    2.40                      |        7.66e-6
              18              |                    1.42                      |        3.84e-6
              19              |                    0.678                     |        1.94e-6
              20              |                    0.197                     |        1.01e-6
              21              |                    0.197                     |        5.36e-7
              22              |                    0.0480                    |        2.98e-7
              23              |                    0.0480                    |        1.79e-7
              24              |                    0.0256                    |        1.79e-7
              25              |                    0.0220                    |        1.49e-7
              26              |                    0.0203                    |        1.27e-7
              27              |                    0.0195                    |        1.19e-7
              28              |                    0.0193                    |        1.19e-7
              29              |                    0.0193                    |        1.19e-7

template<FracRepType numFracBits, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> cos(const qVar_t<T> &inp)

Computes the cosine of the input qVar. The input should be in the range [-pi/2, pi/2] and use at least 3 integer bits. If an angle is not in this range, it is recommended to reduce it with the following trig identities:

  a. if abs(theta) > 2*pi -> theta = theta % (2*pi)
  b. if theta > pi        -> theta = theta - (2*pi)
  c. if theta < -pi       -> theta = theta + (2*pi)
  d. if theta > pi/2      -> cos(theta) = -cos(theta - pi)
  e. if theta < -pi/2     -> cos(theta) = -cos(theta + pi)

Template Parameters
  • T: Underlying type of qVar (deducible)

  • numFracBits: Number of bits used for fractional part

Parameters
  • [in] inp: The input theta in radians (between -pi/2 and pi/2). Input type is 32-bit fixed point. Error Table:

    Number of Fractional Bits | Percent of computations w/ rel. error > 1e-4 | Maximum Absolute Error
    --------------------------|----------------------------------------------|-----------------------
               2              |                    84.6                      |        2.47e-1
               3              |                    92.0                      |        1.25e-1
               4              |                    96.1                      |        6.25e-2
               5              |                    98.0                      |        3.12e-2
               6              |                    99.0                      |        1.56e-2
               7              |                    98.5                      |        7.81e-3
               8              |                    97.0                      |        3.91e-3
               9              |                    95.5                      |        1.95e-3
              10              |                    93.3                      |        9.77e-4
              11              |                    87.9                      |        4.88e-4
              12              |                    74.3                      |        2.44e-4
              13              |                    48.0                      |        1.22e-4
              14              |                    20.3                      |        6.11e-5
              15              |                    10.0                      |        3.05e-5
              16              |                    5.00                      |        1.53e-5
              17              |                    2.40                      |        7.66e-6
              18              |                    1.42                      |        3.84e-6
              19              |                    0.678                     |        1.94e-6
              20              |                    0.197                     |        1.01e-6
              21              |                    0.197                     |        5.36e-7
              22              |                    0.0480                    |        2.98e-7
              23              |                    0.0480                    |        1.79e-7
              24              |                    0.0256                    |        1.79e-7
              25              |                    0.0220                    |        1.49e-7
              26              |                    0.0203                    |        1.27e-7
              27              |                    0.0195                    |        1.19e-7
              28              |                    0.0193                    |        1.19e-7
              29              |                    0.0193                    |        1.19e-7

template<FracRepType numFracBits, typename T>
INLINE const qVar_t<T> incrementMagnitudeByHalf(const qVar_t<T> &op)

Performs a fixed-point increment, adding (1 << (numFracBits - 1)) to the magnitude of the input.

Return

qVar_t<T> Fixed point result

Template Parameters
  • numFracBits: Number of Fraction bits used to represent fixed point input

Parameters
  • [in] op: The input, represented in fixed point

template<FracRepType numFracBits, typename T>
INLINE const qVar_t<T> stdRound(const qVar_t<T> &op)

Performs a fixed-point round: adds (1 << (numFracBits - 1)) to the magnitude, then applies two shifts (and a negation if necessary). Values at exactly 0.5 round away from zero.

Return

qVar_t<T> Fixed point result

Template Parameters
  • numFracBits: Number of Fraction bits used to represent fixed point input

Parameters
  • [in] op: The input, represented in fixed point
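
A sketch of the underlying arithmetic on the raw integer value, covering both the incrementMagnitudeByHalf step and the two shifts (illustrative only; assumes a two's-complement underlying type):

  // Round-half-away-from-zero on a raw fixed-point value, per the description above.
  std::int32_t stdRoundRaw(std::int32_t v, std::int32_t numFracBits) {
    const std::int32_t half = 1 << (numFracBits - 1);    // what incrementMagnitudeByHalf adds
    std::int32_t mag = (v < 0) ? -v : v;                 // work on the magnitude
    mag = ((mag + half) >> numFracBits) << numFracBits;  // two shifts clear the fraction
    return (v < 0) ? -mag : mag;                         // negation if necessary
  }

For example, with numFracBits = 4 the raw value 40 (2.5) rounds to 48 (3.0), and -40 (-2.5) rounds to -48 (-3.0).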

template<typename DdrInOutSignalTensorShape, typename DdrWeightsTensorShape, typename qvarElemType = typename DdrWeightsTensorShape::elemType>
INLINE void fft1d(DdrInOutSignalTensorShape &ddrIn, DdrWeightsTensorShape &ddrWeights, DdrInOutSignalTensorShape &ddrOut, MemAllocator &ocmMemAlloc)

Performs a 1D N-point FFT on a given DDR signal. N must be a power of two. The N-point FFT is mapped using the Cooley–Tukey FFT algorithm, similar to the explanation found at https://www.cs.cmu.edu/afs/andrew/scs/cs/15-463/2001/pub/www/notes/fourier/fourier.pdf. The diagram below shows the two weight matrices for an 8-point FFT.

* Below Wn = e^(-2*pi*n/N)
* +--------------------------+   +--------------------------+    +-----+
* | 1   W0                   |   | 1 W0                     |    |  a0 |
* |                          |   |                          |    |     |
* |    1   W2                |   | 1 W4                     |    |  a4 |
* |                          |   |                          |    |     |
* | 1   W4                   |   |      1 W0                |    |  a2 |
* |                          |   |                          |    |     |
* |    1   W6                |   |      1 W4                |    |  a6 |
* |                          |   |                          |    |     |
* |             1   W0       |   |           1 W0           |    |  a1 |
* |                          |   |                          |    |     |
* |                1   W2    |   |           1 W4           |    |  a5 |
* |                          |   |                          |    |     |
* |             1   W4       |   |                 ++       |    |  a3 |
* |                          |   |                          |    |     |
* |                1   W6    |   |                    ++    |    |  a7 |
* |                          |   |                          |    |     |
* +--------------------------+   +--------------------------+    +-----+
*       Weight stage 1                   Weight stage 0          input
*   W0, W2, W4, W6, W0....         W0, W4, W0, W4, W0....       a0, a4, a2 ....
*
* Weights are sparse matrices but are stored in a packed manner. There are a total of log2(inputPoints) weight
* stages, where inputPoints = len(input matrix).
*
* The Diagram below shows the interaction of weights and inputs for stage 0
* +-----------+        +-----------+        +-----------+       +-----------+
* |           |        |           |        |           |       |           |
* |     a0    | <----> |     a4    |        |     a2    | <---> |    a6     |
* | a0 + W0*a4|        | a0 + W4*a4|        | a2 + W0*a6|       | a2 + W4*a6|
* |           |        |           |        |           |       |           |
* +-----------+        +-----------+        +-----------+       +-----------+
*
* The Diagram below shows the interaction of weights and inputs for stage 1
*  +-----------+        +-----------+        +-----------+       +-----------+
*  |           | <-------------------------> |           |       |           |
*  |    a0     |        |    a1     |        |    a2     |       |    a3     |
*  | a0 + W0*a2|        | a1 + W2*a3|        | a0 + W4*a2|       | a1 + W6*a3|
*  |           |        |           | <------------------------> |           |
*  +-----------+        +-----------+        +-----------+       +-----------+
*
* There will be a total of log2(inputPoints) stages, and for each stage the neighbor data is 2^stage away.
* e.g. on an 8x8 array, for stages 0-2 the neighbor data lies within the same row, for stages 3-5 it lies within the
* same column, and anything above lies within the same array core since the data wraps around.
*
Template Parameters
  • (deduced): DdrInOutSignalTensorShape: DDR input/output signal tensor shape. ComplexCount = 1 if only real or 2 if complex

  • (deduced): DdrWeightsTensorShape: DDR weight tensor shape, holding log2(inputPoints) weight stages. ComplexCount = 1 if only real or 2 if complex

  • (deduced): DdrWeightsTensorShape::elemType

Parameters
  • ddrIn: input ddr tensor

  • ddrWeights: ddr weights

  • ddrOut: output ddr containing the final result

  • ocmMemAlloc: ocm allocator to be used locally in fft to allocate ocm memory space
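
For reference, here is a compact host-side radix-2 Cooley–Tukey FFT in standard C++; it is a conceptual model of the bit-reversal and staged butterflies described above, not the EPU implementation:

  #include <cmath>
  #include <complex>
  #include <utility>
  #include <vector>

  // In-place iterative radix-2 DIT FFT; a.size() must be a power of two.
  void fftRef(std::vector<std::complex<double>>& a) {
    const std::size_t n  = a.size();
    const double      PI = std::acos(-1.0);
    // Bit-reversal permutation: matches the a0, a4, a2, a6, ... input order above.
    for (std::size_t i = 1, j = 0; i < n; i++) {
      std::size_t bit = n >> 1;
      for (; j & bit; bit >>= 1) j ^= bit;
      j ^= bit;
      if (i < j) std::swap(a[i], a[j]);
    }
    // log2(n) stages; at each stage, butterflies pair elements len/2 apart.
    for (std::size_t len = 2; len <= n; len <<= 1) {
      const double ang = -2.0 * PI / static_cast<double>(len);  // Wn = e^(-2*pi*i/len)
      const std::complex<double> wlen(std::cos(ang), std::sin(ang));
      for (std::size_t i = 0; i < n; i += len) {
        std::complex<double> w(1.0, 0.0);
        for (std::size_t k = 0; k < len / 2; k++) {
          const std::complex<double> u = a[i + k];
          const std::complex<double> v = a[i + k + len / 2] * w;
          a[i + k]           = u + v;  // a + W*b, as in the weight matrices above
          a[i + k + len / 2] = u - v;  // the paired butterfly output
          w *= wlen;
        }
      }
    }
  }
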
template<typename DdrInOutSignalTensorShape, typename DdrWeightsTensorShape, typename qvarElemType = typename DdrWeightsTensorShape::elemType>
INLINE void fft2d(DdrInOutSignalTensorShape &ddrIn, DdrWeightsTensorShape &ddrWeights, DdrInOutSignalTensorShape &ddrOut, MemAllocator &ocmMemAlloc)

Performs a 2D N-point FFT on a given 2D DDR signal.

*           1D per-row FFT                         Store the output at                   Transpose                    1D row FFT. In essence this is a 1D
*                                                  bit-reversed location                                              column FFT since we transposed it.
* +------------------------------+          +-----------------------------+       +----------------------------+       +----------------------------+
* +------------------------------+          |                             |       |                            |       +----------------------------+
* |    a0  a1  a2  a3 ...        +-----+ +->+    cc0  cc1  cc2  cc3  ...  |       |  cc0  bb0 ee0  aa0         |       |  cc0  bb0 ee0  aa0         |
* +------------------------------+     | |  |                             |       |                            |       +----------------------------+
* |    b0  b1  b2  b3 ...        +--------->+    bb0  bb1  bb2  bb3  ...  |       |  cc1  bb1 ee1  aa1         |       |  cc1  bb1 ee1  aa1         |
* +------------------------------+     | |  |                             | +---> |                     ...    | +---> +----------------------------+   +--->  Result (transposed)
* |    c0  c1  c2  c3 ...        +-------+  |    ee0  ee1  ee2  ee3  ...  |       |  cc2  bb2 ee2  aa2         |       |  cc2  bb2 ee2  aa2         |
* +------------------------------+     |    |                             |       |                            |       +----------------------------+
* |    d0  d1  d2  d3 ...        +--+  +--->+    aa0  aa1  aa2  aa3  ...  |       |  cc3  bb3 ee3  aa3         |       |  cc3  bb3 ee3  aa3         |
* +------------------------------+  |       |                             |       |                            |       +----------------------------+
* |                              |  |       |            ...              |       |  ...  ...  ...             |       |  ...  ...  ...             |
* |                              |  +------>+    dd0  dd1  dd2  dd3  ...  |       |                            |       |                            |
* |                              |          |                             |       |                            |       |                            |
* +------------------------------+          +-----------------------------+       +----------------------------+       +----------------------------+
*
Return

void

Template Parameters
  • (deduced): DdrInOutSignalTensorShape: DDR input tensor shape. Expected to be of the form <1, complexCount, inputPoints, inputPoints>. ComplexCount = 1 if only real or 2 if complex

  • (deduced): DdrWeightsTensorShape: DDR weight tensor. Expected to be of the form <1, complexCount,log2(inputPoints),inputPoints>. ComplexCount = 1 if only real or 2 if complex

  • (deduced): DdrWeightsTensorShape::elemType

Parameters
  • ddrIn: input ddr tensor

  • ddrWeights: ddr weights

  • ddrOut: output ddr containing the final result

  • ocmMemAlloc: ocm allocator to be used locally in fft to allocate ocm memory space
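
A minimal kernel-side call sketch for fft2d, using the shape forms documented above (the element type, sizes, and names here are illustrative assumptions, not a shipped example; it follows the structure of the gemm example later in this section):

  #include <qmath.hpp>

  typedef FixedPoint16<8> FftT;          // assumed element type
  const std::int32_t inputPoints  = 64;  // must be a power of two
  const std::int32_t complexCount = 2;   // 1 if only real, 2 if complex
  const std::int32_t log2Points   = 6;   // log2(inputPoints)

  // Shapes per the template-parameter documentation above.
  typedef DdrTensor<FftT, 1, complexCount, inputPoints, inputPoints> DdrSignalShape;
  typedef DdrTensor<FftT, 1, complexCount, log2Points, inputPoints>  DdrWeightShape;

  void fft2dTest(DdrSignalShape::ptrType inPtr, DdrWeightShape::ptrType wPtr, DdrSignalShape::ptrType outPtr) {
    DdrSignalShape ddrIn(inPtr);
    DdrWeightShape ddrWeights(wPtr);
    DdrSignalShape ddrOut(outPtr);

    MemAllocator ocmMem;  // fft allocates its OCM working space from this
    qmath::fft2d(ddrIn, ddrWeights, ddrOut, ocmMem);
  }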

Examples

sin

  constexpr FracRepType            NUM_F_BITS = 22;
  qVar_t<FixedPoint32<NUM_F_BITS>> qPi        = 3.141592;

  qVar_t<FixedPoint32<NUM_F_BITS>> qAngle = 50 * qPi / 180;
  qVar_t<FixedPoint32<NUM_F_BITS>> qResult;

  qResult = qmath::sin(qAngle);  // Angle in radians [-pi/2, pi/2]

cos

  qResult = qmath::cos(qAngle);  // Angle in radians [-pi/2, pi/2]

log

  qVar_t<FixedPoint32<NUM_F_BITS>> qData = 10.8;
  qResult                                = qmath::log(qData);  // natural logarithm (base e)
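
exp

A sketch by analogy with the calls above (illustrative; qmath::exp takes a fixed-point qVar input per the signature earlier):

  qResult = qmath::exp(qData);  // natural exponent, e^qData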

GEMM

General Matrix Multiply, better known as GEMM, is an optimized API call to perform matrix multiplication.

template<typename AlphaType, typename OCMShapeA, typename OCMShapeB, typename OCMShapeC, typename BetaType>
void qmath::gemm(AlphaType alpha, OCMShapeA &ocmA, OCMShapeB &ocmB, BetaType beta, OCMShapeC &ocmC, OCMShapeC &ocmOut)

Calculates the general matrix multiplication (GEMM) out = alpha*A*B + beta*C. See https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3.

Template Parameters
  • AlphaType: Type for the scalar alpha

  • OCMShapeA: Type for the matrix A (In OCM)

  • OCMShapeB: Type for the matrix B (In OCM)

  • OCMShapeC: Type for the matrix C (In OCM)

  • BetaType: Type for the scalar beta

Parameters
  • alpha: scalar alpha

  • ocmA: matrix A

  • ocmB: matrix B

  • beta: scalar beta

  • ocmC: matrix C

  • ocmOut: output matrix

template<typename AlphaType, typename OCMShapeA, typename OCMShapeB, typename BetaType, typename OCMShapeC, isOcmTensor<OCMShapeA> = 0, isOcmTensor<OCMShapeB> = 0, isOcmTensor<OCMShapeC> = 0>
void qmath::gemmMatrixMulMatrixPacked(AlphaType alpha, OCMShapeA &matA, OCMShapeB &matB, BetaType beta, OCMShapeC &matC, OCMShapeC &matOut)

A specialized version of matrixMulMatrix for GEMM that utilizes a packed data flow; this may cause a performance regression on small matrices.

Template Parameters
  • AlphaType: Type for the scalar alpha

  • OCMShapeA: Type for the matrix A (In OCM)

  • OCMShapeB: Type for the matrix B (In OCM)

  • BetaType: Type for the scalar beta

  • OCMShapeC: Type for the matrix C (In OCM)

Parameters
  • alpha: the scalar alpha

  • matA: the first matrix, A

  • matB: the second matrix, B

  • beta: the scalar beta

  • matC: the matrix C

  • matOut: the output matrix

Examples

gemm()

gemm.hpp:

#include <quadric/qil.h>

const std::int32_t                numFracBits = 4;
typedef FixedPoint16<numFracBits> MatT;

const std::int32_t matrixAWidth  = 1024;
const std::int32_t matrixAHeight = 1024;
const std::int32_t matrixBWidth  = 1024;

typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> DdrAInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> OcmAInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  DdrBInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  OcmBInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> DdrCInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> OcmCInOutShape;

EPU_ENTRY void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
                        DdrAInOutShape::ptrType        aPtr,
                        DdrBInOutShape::ptrType        bPtr,
                        DdrAInOutShape::UnderlyingType betaIn,
                        DdrCInOutShape::ptrType        cPtr,
                        DdrCInOutShape::ptrType        outPtr);

gemm.cpp:

/**
 * @file gemm.cpp
 */

#include "gemm.hpp"
#include <qmath.hpp>

void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
              DdrAInOutShape::ptrType        aPtr,
              DdrBInOutShape::ptrType        bPtr,
              DdrAInOutShape::UnderlyingType betaIn,
              DdrCInOutShape::ptrType        cPtr,
              DdrCInOutShape::ptrType        outPtr) {
  MatT alpha;
  alpha.value = alphaIn;
  MatT beta;
  beta.value = betaIn;

  DdrAInOutShape a(aPtr);
  DdrBInOutShape b(bPtr);
  DdrCInOutShape c(cPtr);
  DdrCInOutShape out(outPtr);

  MemAllocator ocmMem;

  OcmAInOutShape ocmA;
  OcmBInOutShape ocmB;
  OcmCInOutShape ocmC;
  OcmCInOutShape ocmOut;
  ocmMem.allocate(ocmA);
  ocmMem.allocate(ocmB);
  ocmMem.allocate(ocmC);
  ocmMem.allocate(ocmOut);

  memCpy(a, ocmA);
  memCpy(b, ocmB);
  memCpy(c, ocmC);

  qmath::gemm(alpha, ocmA, ocmB, beta, ocmC, ocmOut);

  memCpy(ocmOut, out);
}

gemm_host.cpp:

#include <quadric/host.h>
#include "gemm.hpp"

void generateOutputTensorAddToB(DdrCInOutShape& matAPtr, DdrCInOutShape& matBPtr) {
  auto matA = DdrCInOutShape::cast(matAPtr);
  auto matB = DdrCInOutShape::cast(matBPtr);

  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matB)[0][0][heightItr][widthItr] += (*matA)[0][0][heightItr][widthItr];
    }
  }
}

void generateOutputTensorMatrixMulMatrix(DdrAInOutShape& matAPtr, DdrBInOutShape& matBPtr, DdrCInOutShape& outPtr) {
  auto matA = DdrAInOutShape::cast(matAPtr);
  auto matB = DdrBInOutShape::cast(matBPtr);
  auto out  = DdrCInOutShape::cast(outPtr);
  clearTensor(outPtr);
  for(std::int32_t i = 0; i < DdrAInOutShape::NUM_ROWS; i++) {
    for(std::int32_t j = 0; j < DdrBInOutShape::NUM_COLS; j++) {
      if(std::is_same<MatT, FixedPoint16<numFracBits>>::value) {
        // fx16, MAC supported, use the higher precision way
        float sum = 0;
        for(std::int32_t k = 0; k < DdrAInOutShape::NUM_COLS; k++) {
          sum += fixedToFloat<numFracBits>((*matA)[0][0][i][k]) * fixedToFloat<numFracBits>((*matB)[0][0][k][j]);
        }
        (*out)[0][0][i][j] = floatToFixed<numFracBits>(sum);
      } else {
        for(std::int32_t k = 0; k < DdrAInOutShape::NUM_COLS; k++) {
          (*out)[0][0][i][j] += floatToFixed<numFracBits>(fixedToFloat<numFracBits>((*matA)[0][0][i][k]) *
                                                          fixedToFloat<numFracBits>((*matB)[0][0][k][j]));
        }
      }
    }
  }
}

void generateOutputTensorScalarMulMatrix(MatT alpha, DdrCInOutShape& matAPtr) {
  auto matA       = DdrCInOutShape::cast(matAPtr);
  auto floatAlpha = fixedToFloat<numFracBits>(alpha.value);
  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matA)[0][0][heightItr][widthItr] =
        floatToFixed<numFracBits>(fixedToFloat<numFracBits>((*matA)[0][0][heightItr][widthItr]) * floatAlpha);
    }
  }
}

// out = alpha*a*b + beta*c
// modifies c
void generateOutputTensorGemm(MatT            alpha,
                              DdrAInOutShape& matAPtr,
                              DdrBInOutShape& matBPtr,
                              MatT            beta,
                              DdrCInOutShape& matCPtr,
                              DdrCInOutShape& outPtr) {
  generateOutputTensorMatrixMulMatrix(matAPtr, matBPtr, outPtr);
  generateOutputTensorScalarMulMatrix(alpha, outPtr);
  generateOutputTensorScalarMulMatrix(beta, matCPtr);
  generateOutputTensorAddToB(matCPtr, outPtr);
}
HOST_MAIN(
  // clang-format off
  std::mt19937 randGen(std::random_device{}());
  std::uniform_real_distribution<float> urd(0,1);
  MatT alpha = urd(randGen);
  MatT beta = urd(randGen);

  DdrAInOutShape matAInp;
  DdrAInOutShape::allocate(matAInp);
  DdrBInOutShape matBInp;
  DdrBInOutShape::allocate(matBInp);
  DdrCInOutShape matCInp;
  DdrCInOutShape::allocate(matCInp);
  DdrCInOutShape matC2Inp;
  DdrCInOutShape::allocate(matC2Inp);

  DdrCInOutShape ddrOut;
  DdrCInOutShape::allocate(ddrOut);
  DdrCInOutShape expectedOut;
  DdrCInOutShape::allocate(expectedOut);

  populateTensorRand(matAInp, 0<<numFracBits, 1<<numFracBits);
  populateTensorRand(matBInp, 1<<numFracBits, 2<<numFracBits);
  populateTensorRand(matCInp, 1<<numFracBits, 2<<numFracBits);
  copyTensor(matCInp,matC2Inp);

  // Generate expected output.
  generateOutputTensorGemm(alpha, matAInp, matBInp, beta, matC2Inp, expectedOut);


  TensorArg<DdrAInOutShape> inputArgA{&matAInp};
  TensorArg<DdrBInOutShape> inputArgB{&matBInp};
  TensorArg<DdrCInOutShape> inputArgC{&matCInp};

  TensorArg<DdrCInOutShape> outputArg{&ddrOut, &expectedOut, FixedPoint16<numFracBits>(0.5).value};
  packageKernel(OUTPUT_PREFIX, FUNC_ARG(gemmtest), alpha.value, inputArgA, inputArgB, beta.value, inputArgC, outputArg);

  DeviceBufferRef aIn, bIn, cIn, out;

  auto device = DeviceManager().getDevice(callKernelGlobals.simConfig);
  device.loadKernel();
  device.allocateAndCopyToDevice(matAInp, aIn);
  device.allocateAndCopyToDevice(matBInp, bIn);
  device.allocateAndCopyToDevice(matCInp, cIn);
  device.allocate<DdrCInOutShape>(out);

  device.runKernel(ENTRYPOINT(gemmtest), alpha.value, aIn, bIn, beta.value, cIn, out);


  device.copyBufferFromDevice(out, ddrOut);

  auto nativeCompareVisitor = [](const auto &t1, const auto &t2) {
    return nativeCompare<DdrCInOutShape>(t1, t2, FixedPoint16<numFracBits>(0.5).value);
  };

  runtime_assert(compareTensors(ddrOut, expectedOut, nativeCompareVisitor), "tensor mismatch");
  // clang-format on
)

gemmMatrixMulMatrixPacked

gemm_packed.hpp:

#include <quadric/qil.h>

typedef std::int8_t MatT;

const std::int32_t matrixAWidth  = 1024;
const std::int32_t matrixAHeight = 1024;
const std::int32_t matrixBWidth  = 1024;

typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> DdrAInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> OcmAInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  DdrBInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  OcmBInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> DdrCInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> OcmCInOutShape;

EPU_ENTRY void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
                        DdrAInOutShape::ptrType        aPtr,
                        DdrBInOutShape::ptrType        bPtr,
                        DdrAInOutShape::UnderlyingType betaIn,
                        DdrCInOutShape::ptrType        cPtr,
                        DdrCInOutShape::ptrType        outPtr);

gemm_packed.cpp:

/**
 * @file gemm_packed.cpp
 */

#include "gemm_packed.hpp"
#include <qmath.hpp>

void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
              DdrAInOutShape::ptrType        aPtr,
              DdrBInOutShape::ptrType        bPtr,
              DdrAInOutShape::UnderlyingType betaIn,
              DdrCInOutShape::ptrType        cPtr,
              DdrCInOutShape::ptrType        outPtr) {
  MatT alpha;
  alpha = alphaIn;
  MatT beta;
  beta = betaIn;

  DdrAInOutShape a(aPtr);
  DdrBInOutShape b(bPtr);
  DdrCInOutShape c(cPtr);
  DdrCInOutShape out(outPtr);

  MemAllocator ocmMem;

  OcmAInOutShape ocmA;
  OcmBInOutShape ocmB;
  OcmCInOutShape ocmC;
  OcmCInOutShape ocmOut;
  ocmMem.allocate(ocmA);
  ocmMem.allocate(ocmB);
  ocmMem.allocate(ocmC);
  ocmMem.allocate(ocmOut);

  memCpy(a, ocmA);
  memCpy(b, ocmB);
  memCpy(c, ocmC);

  qmath::gemmMatrixMulMatrixPacked(alpha, ocmA, ocmB, beta, ocmC, ocmOut);

  memCpy(ocmOut, out);
}

gemm_packed_host.cpp:

#include <quadric/host.h>
#include "gemm_packed.hpp"

void generateOutputTensorAddToB(DdrCInOutShape& matAPtr, DdrCInOutShape& matBPtr) {
  auto matA = DdrCInOutShape::cast(matAPtr);
  auto matB = DdrCInOutShape::cast(matBPtr);

  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matB)[0][0][heightItr][widthItr] += (*matA)[0][0][heightItr][widthItr];
    }
  }
}

void generateOutputTensorMatrixMulMatrix(DdrAInOutShape& matAPtr, DdrBInOutShape& matBPtr, DdrCInOutShape& outPtr) {
  auto matA = DdrAInOutShape::cast(matAPtr);
  auto matB = DdrBInOutShape::cast(matBPtr);
  auto out  = DdrCInOutShape::cast(outPtr);
  clearTensor(outPtr);
  for(std::int32_t i = 0; i < DdrAInOutShape::NUM_ROWS; i++) {
    for(std::int32_t j = 0; j < DdrBInOutShape::NUM_COLS; j++) {
      for(std::int32_t k = 0; k < DdrAInOutShape::NUM_COLS; k++) {
        (*out)[0][0][i][j] += (*matA)[0][0][i][k] * (*matB)[0][0][k][j];
      }
    }
  }
}

void generateOutputTensorScalarMulMatrix(MatT alpha, DdrCInOutShape& matAPtr) {
  auto matA = DdrCInOutShape::cast(matAPtr);
  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matA)[0][0][heightItr][widthItr] = (*matA)[0][0][heightItr][widthItr] * alpha;
    }
  }
}

// out = alpha*a*b + beta*c
// modifies c
void generateOutputTensorGemm(MatT            alpha,
                              DdrAInOutShape& matAPtr,
                              DdrBInOutShape& matBPtr,
                              MatT            beta,
                              DdrCInOutShape& matCPtr,
                              DdrCInOutShape& outPtr) {
  generateOutputTensorMatrixMulMatrix(matAPtr, matBPtr, outPtr);
  generateOutputTensorScalarMulMatrix(alpha, outPtr);
  generateOutputTensorScalarMulMatrix(beta, matCPtr);
  generateOutputTensorAddToB(matCPtr, outPtr);
}
HOST_MAIN(
  // clang-format off
  std::mt19937 randGen(std::random_device{}());
  std::uniform_real_distribution<float> urd(1,3);
  MatT alpha = urd(randGen);
  MatT beta = urd(randGen);

  DdrAInOutShape matAInp;
  DdrAInOutShape::allocate(matAInp);
  DdrBInOutShape matBInp;
  DdrBInOutShape::allocate(matBInp);
  DdrCInOutShape matCInp;
  DdrCInOutShape::allocate(matCInp);
  DdrCInOutShape matC2Inp;
  DdrCInOutShape::allocate(matC2Inp);

  DdrCInOutShape ddrOut;
  DdrCInOutShape::allocate(ddrOut);
  DdrCInOutShape expectedOut;
  DdrCInOutShape::allocate(expectedOut);

  populateTensorRand(matAInp, 0, 2);
  populateTensorRand(matBInp, 0, 2);
  populateTensorRand(matCInp, 0, 2);
  copyTensor(matCInp,matC2Inp);

  // Generate expected output.
  generateOutputTensorGemm(alpha, matAInp, matBInp, beta, matC2Inp, expectedOut);

  TensorArg<DdrAInOutShape> inputArgA{&matAInp};
  TensorArg<DdrBInOutShape> inputArgB{&matBInp};
  TensorArg<DdrCInOutShape> inputArgC{&matCInp};

  TensorArg<DdrCInOutShape> outputArg{&ddrOut, &expectedOut};
  packageKernel(OUTPUT_PREFIX, FUNC_ARG(gemmtest), alpha, inputArgA, inputArgB, beta, inputArgC, outputArg);

  DeviceBufferRef aIn, bIn, cIn, out;

  auto device = DeviceManager().getDevice(callKernelGlobals.simConfig);
  device.loadKernel();
  device.allocateAndCopyToDevice(matAInp, aIn);
  device.allocateAndCopyToDevice(matBInp, bIn);
  device.allocateAndCopyToDevice(matCInp, cIn);
  device.allocate<DdrCInOutShape>(out);

  device.runKernel(ENTRYPOINT(gemmtest), alpha, aIn, bIn, beta, cIn, out);

  device.copyBufferFromDevice(out, ddrOut);

  auto nativeCompareVisitor = [](const auto &t1, const auto &t2) {
    return nativeCompare<DdrCInOutShape>(t1, t2);
  };

  runtime_assert(compareTensors(ddrOut, expectedOut, nativeCompareVisitor), "Tensors did not match");
  // clang-format on
)