Math Library

qMath Functions

Apart from the standard arithmetic operators such as +, -, *, / and %, quadric C++ also supports special functions in the qMath namespace, such as:

constexpr std::int32_t log2(std::int32_t n)

Computes the base-2 logarithm of an integer at compile time.

Parameters
  • [in] n: The input value.
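
A minimal sketch of an equivalent compile-time implementation, with floor semantics assumed (log2Ref is a hypothetical name; this is illustrative, not the library source):

  constexpr std::int32_t log2Ref(std::int32_t n) {
    return (n < 2) ? 0 : 1 + log2Ref(n / 2);  // recurse until n < 2, counting halvings
  }

  static_assert(log2Ref(16) == 4, "exact power of two");
  static_assert(log2Ref(17) == 4, "rounds down");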

template<FracRepType numFracBits, typename T>
INLINE qVar_t<FixedPoint<T, numFracBits>> log(const qVar_t<FixedPoint<T, numFracBits>> &inp)

Computes the natural logarithm of the input qVar. The expected input type is signed 32-bit fixed point with 16 integer bits and 16 fractional bits; the output is produced in the same format.

Algorithm reference: https://www.quinapalus.com/efunc.html

Template Parameters
  • numFracBits: Number of bits used for the fractional part

  • T: Underlying type of qVar (deduced)

Parameters
  • [in] inp: The input qVar. Input type is 32-bit fixed point.

    Error Table:

    Number of Fractional Bits | Percent of fixed-point computes w/ rel. error > 1e-4 | Maximum Absolute Error
              5               |                        95.8                          |        1.40e-1
              6               |                        97.4                          |        9.18e-2
              7               |                        95.4                          |        5.25e-3
              8               |                        88.3                          |        2.49e-2
              9               |                        63.4                          |        1.04e-2
             10               |                        35.1                          |        4.78e-3
             11               |                        2.84                          |        2.10e-3
             12               |                        0.018                         |        9.13e-4
             13               |                        0.00200                       |        4.02e-4
             14               |                        0.000754                      |        1.79e-4
             15               |                        0.000920                      |        8.92e-5
             16               |                        0.00204                       |        7.77e-5
             17               |                        0.00214                       |        5.02e-5
             18               |                        0.00261                       |        4.08e-5
             19               |                        0.00596                       |        3.67e-5
             20               |                        0.0101                        |        3.29e-5
             21               |                        0.0202                        |        3.19e-5
             22               |                        0.0389                        |        3.09e-5
             23               |                        0.0763                        |        3.09e-5
             24               |                        0.152                         |        3.05e-5
             25               |                        0.302                         |        3.05e-5
             26               |                        0.604                         |        3.05e-5
             27               |                        1.21                          |        3.05e-5

template<FracRepType numFracBits, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> exp(const qVar_t<T> &inp)

Computes the natural exponential e^x of the input variable.

Template Parameters
  • numFracBits: Number of bits used for the fractional part

  • T: Underlying type of qVar (deduced)

Parameters
  • [in] inp: The input value. Input type is 32-bit fixed point.
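
A hedged usage sketch, mirroring the sin/cos examples later in this section; the exact call pattern is assumed to match those examples:

  qVar_t<FixedPoint32<16>> qX   = 1.5;
  qVar_t<FixedPoint32<16>> qExp = qmath::exp(qX);  // e^1.5, approximately 4.48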

template<FracRepType numFracBits, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> vectorMag(qVar_t<T> &x, qVar_t<T> &y)

Computes the magnitude sqrt(x^2 + y^2) of the input qVars. Ranges vary with the inputs, but it is recommended to use at least one more integer bit than is needed to represent the largest input.

Template Parameters
  • numFracBits: Number of bits used for the fractional part

  • T: Underlying type of qVar (deduced)

Parameters
  • [in] x: The input in the x dimension (its sign is flipped in place if negative). Input type is 32-bit fixed point.

  • [in] y: The input in the y dimension. Input type is 32-bit fixed point.
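
A hedged usage sketch (note the non-const references; x may be sign-flipped in place):

  qVar_t<FixedPoint32<16>> qX   = 3.0;
  qVar_t<FixedPoint32<16>> qY   = 4.0;
  qVar_t<FixedPoint32<16>> qMag = qmath::vectorMag(qX, qY);  // expect ~5.0 for a 3-4-5 triangle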

template<FracRepType numFracBits, bool transformRange = false, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> sin(const qVar_t<T> &inp)

Computes the sine of the input qVar.

Template Parameters
  • numFracBits: Number of bits used for the fractional part

  • transformRange: Allows inputs of arbitrary value at the cost of additional overhead

  • T: Underlying type of qVar (deduced)

Parameters
  • [in] inp: The input angle theta in radians (between -pi/2 and pi/2 unless transformRange is set). Input type is 32-bit fixed point.

    Error Table:

    Number of Fractional Bits | Percent of fixed-point computes w/ rel. error > 1e-4 | Maximum Absolute Error
              2               |                        84.6                          |        2.47e-1
              3               |                        92.0                          |        1.25e-1
              4               |                        96.1                          |        6.25e-2
              5               |                        98.0                          |        3.12e-2
              6               |                        99.0                          |        1.56e-2
              7               |                        98.5                          |        7.81e-3
              8               |                        97.0                          |        3.91e-3
              9               |                        95.5                          |        1.95e-3
             10               |                        93.3                          |        9.77e-4
             11               |                        87.9                          |        4.88e-4
             12               |                        74.3                          |        2.44e-4
             13               |                        48.0                          |        1.22e-4
             14               |                        20.3                          |        6.11e-5
             15               |                        10.0                          |        3.05e-5
             16               |                        5.00                          |        1.53e-5
             17               |                        2.40                          |        7.66e-6
             18               |                        1.42                          |        3.84e-6
             19               |                        0.678                         |        1.94e-6
             20               |                        0.197                         |        1.01e-6
             21               |                        0.197                         |        5.36e-7
             22               |                        0.0480                        |        2.98e-7
             23               |                        0.0480                        |        1.79e-7
             24               |                        0.0256                        |        1.79e-7
             25               |                        0.0220                        |        1.49e-7
             26               |                        0.0203                        |        1.27e-7
             27               |                        0.0195                        |        1.19e-7
             28               |                        0.0193                        |        1.19e-7
             29               |                        0.0193                        |        1.19e-7

template<FracRepType numFracBits, bool transformRange = false, typename T, IfIntegerTy<T> = 0>
INLINE qVar_t<T> cos(const qVar_t<T> &inp)

Computes the cosine of the input qVar.

Template Parameters
  • numFracBits: Number of bits used for the fractional part

  • transformRange: Allows inputs of arbitrary value at the cost of additional overhead

  • T: Underlying type of qVar (deduced)

Parameters
  • [in] inp: The input angle theta in radians (between -pi/2 and pi/2 unless transformRange is set). Input type is 32-bit fixed point.

    The error table is identical to that of sin above.

template<FracRepType numFracBits, typename T>
INLINE const qVar_t<T> incrementMagnitudeByHalf(const qVar_t<T> &op)

Performs a fixed-point increment by adding (1 << (numFracBits - 1)) to the magnitude of the input.

Return

qVar_t<T> Fixed point result

Template Parameters
  • numFracBits: Number of fractional bits used to represent the fixed-point input

Parameters
  • op: The input value, represented in fixed point
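
A plain-integer analogue of the operation, for illustration only (incrementMagnitudeByHalfRef is a hypothetical helper, not the qVar implementation):

  std::int32_t incrementMagnitudeByHalfRef(std::int32_t op, std::int32_t numFracBits) {
    const std::int32_t half = 1 << (numFracBits - 1);  // 0.5 in fixed point
    return (op < 0) ? (op - half) : (op + half);       // add toward the sign, growing |op|
  }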

template<FracRepType numFracBits, typename T>
INLINE const qVar_t<T> stdRound(const qVar_t<T> &op)

Performs fixed-point rounding: adds (1 << (numFracBits - 1)) to the magnitude, then applies two shifts to clear the fractional bits (with negation before and after if the input is negative). Values at exactly 0.5 round away from zero.

Return

qVar_t<T> Fixed point result

Template Parameters
  • numFracBits: Number of fractional bits used to represent the fixed-point input

Parameters
  • op: The input value, represented in fixed point
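
A plain-integer analogue of the rounding arithmetic described above, for illustration only (stdRoundRef is a hypothetical helper):

  std::int32_t stdRoundRef(std::int32_t op, std::int32_t numFracBits) {
    const bool   neg = op < 0;
    std::int32_t mag = neg ? -op : op;           // work on the magnitude so 0.5 rounds away from 0
    mag += 1 << (numFracBits - 1);               // add 0.5 in fixed point
    mag = (mag >> numFracBits) << numFracBits;   // two shifts clear the fractional bits
    return neg ? -mag : mag;                     // restore the sign
  }

  // e.g. with numFracBits = 4: 2.5 is 40 in raw bits; stdRoundRef(40, 4) == 48, i.e. 3.0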

template<bool complexInput = true, typename DdrInOutSignalTensorShape, typename qvarElemType = typename DdrInOutSignalTensorShape::elemType>
INLINE void fft1d(DdrInOutSignalTensorShape &ddrIn, DdrInOutSignalTensorShape &ddrOut, MemAllocator &ocmMemAlloc)

Performs a 1D N-point FFT on a given DDR signal. N must be a power of two. The N-point FFT is mapped using the Cooley–Tukey algorithm, similar to the explanation found at https://www.cs.cmu.edu/afs/andrew/scs/cs/15-463/2001/pub/www/notes/fourier/fourier.pdf. The diagram below shows the two weight matrices for an 8-point FFT.

* Below Wn = e^(-2*pi*n/N)
* +--------------------------+   +--------------------------+    +-----+
* | 1   W0                   |   | 1 W0                     |    |  a0 |
* |                          |   |                          |    |     |
* |    1   W2                |   | 1 W4                     |    |  a4 |
* |                          |   |                          |    |     |
* | 1   W4                   |   |      1 W0                |    |  a2 |
* |                          |   |                          |    |     |
* |    1   W6                |   |      1 W4                |    |  a6 |
* |                          |   |                          |    |     |
* |             1   W0       |   |           1 W0           |    |  a1 |
* |                          |   |                          |    |     |
* |                1   W2    |   |           1 W4           |    |  a5 |
* |                          |   |                          |    |     |
* |             1   W4       |   |                 ++       |    |  a3 |
* |                          |   |                          |    |     |
* |                1   W6    |   |                    ++    |    |  a7 |
* |                          |   |                          |    |     |
* +--------------------------+   +--------------------------+    +-----+
*       Weight stage 1                   Weight stage 0          input
*   W0, W2, W4, W6, W0....         W0, W4, W0, W4, W0....       a0, a4, a2 ....
*
* Weights form a sparse matrix but are stored in a packed manner. There are a total of log2(inputPoints) weight
* stages, where inputPoints = len(input matrix).
*
* The Diagram below shows the interaction of weights and inputs for stage 0
* +-----------+        +-----------+        +-----------+       +-----------+
* |           |        |           |        |           |       |           |
* |     a0    | <----> |     a4    |        |     a2    | <---> |    a6     |
* | a0 + W0*a4|        | a0 + W4*a4|        | a2 + W0*a6|       | a2 + W4*a6|
* |           |        |           |        |           |       |           |
* +-----------+        +-----------+        +-----------+       +-----------+
*
* The Diagram below shows the interaction of weights and inputs for stage 1
*  +-----------+        +-----------+        +-----------+       +-----------+
*  |           | <-------------------------> |           |       |           |
*  |    a0     |        |    a1     |        |    a2     |       |    a3     |
*  | a0 + W0*a2|        | a1 + W2*a3|        | a0 + W4*a2|       | a1 + W6*a3|
*  |           |        |           | <------------------------> |           |
*  +-----------+        +-----------+        +-----------+       +-----------+
*
* There will be a total of log2(inputPoints) stages, and for each stage the neighbor data is 2^stage away.
* e.g. on an 8x8 array, for stages 0-2 the neighbor data lies within the same row, for stages 3-5 within the same
* column, and anything beyond lies within the same array core since the data wraps around.
*
* For the inverse FFT, the imaginary part is negated before and after the forward FFT, and before output both
* parts are divided by inputPoint.
*
* ref: https://www.dsprelated.com/showarticle/800.php , method 4
*
*

Return

void

Template Parameters
  • complexInput: whether the input is complex or real

  • (deduced) DdrInOutSignalTensorShape: DDR input/output tensor. Expected to be of the form <1, complexCount, 1, inputPoints>, where complexCount = 1 if the signal is real only, or 2 if complex

  • (deduced) qvarElemType: defaults to DdrInOutSignalTensorShape::elemType

Parameters
  • ddrIn: input ddr tensor

  • ddrOut: output ddr containing the final result

  • ocmMemAlloc: OCM allocator used internally by the FFT to allocate scratch memory
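
A hedged call sketch, assuming a 1024-point complex signal packed as <1, 2, 1, 1024> (inPtr and outPtr are hypothetical DDR pointers):

  typedef DdrTensor<FixedPoint32<16>, 1, 2, 1, 1024> DdrSignalShape;

  DdrSignalShape ddrIn(inPtr);
  DdrSignalShape ddrOut(outPtr);
  MemAllocator   ocmMem;

  qmath::fft1d<true>(ddrIn, ddrOut, ocmMem);  // complexInput = true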

template<typename DdrInOutSignalTensorShape, typename DdrWeightsTensorShape, typename qvarElemType = typename DdrWeightsTensorShape::elemType>
INLINE void fft2d(DdrInOutSignalTensorShape &ddrIn, DdrWeightsTensorShape &ddrWeights, DdrInOutSignalTensorShape &ddrOut, MemAllocator &ocmMemAlloc)

Performs a 2D N-point FFT on a given 2D DDR signal.

*           1D per row FFT                         Store the output at                   Transpose                    1D row FFT. This is in essence a 1D
*                                                  bitreversed location                                               col FFT since we transposed it.
* +------------------------------+          +-----------------------------+       +----------------------------+       +----------------------------+
* +------------------------------+          |                             |       |                            |       +----------------------------+
* |    a0  a1  a2  a3 ...        +-----+ +->+    cc0  cc1  cc2  cc3  ...  |       |  cc0  bb0 ee0  aa0         |       |  cc0  bb0 ee0  aa0         |
* +------------------------------+     | |  |                             |       |                            |       +----------------------------+
* |    b0  b1  b2  b3 ...        +--------->+    bb0  bb1  bb2  bb3  ...  |       |  cc1  bb1 ee1  aa1         |       |  cc1  bb1 ee1  aa1         |
* +------------------------------+     | |  |                             | +---> |                     ...    | +---> +----------------------------+   +--->  Result (transposed)
* |    c0  c1  c2  c3 ...        +-------+  |    ee0  ee1  ee2  ee3  ...  |       |  cc2  bb2 ee2  aa2         |       |  cc2  bb2 ee2  aa2         |
* +------------------------------+     |    |                             |       |                            |       +----------------------------+
* |    d0  d1  d2  d3 ...        +--+  +--->+    aa0  aa1  aa2  aa3  ...  |       |  cc3  bb3 ee3  aa3         |       |  cc3  bb3 ee3  aa3         |
* +------------------------------+  |       |                             |       |                            |       +----------------------------+
* |                              |  |       |            ...              |       |  ...  ...  ...             |       |  ...  ...  ...             |
* |                              |  +------>+    dd0  dd1  dd2  dd3  ...  |       |                            |       |                            |
* |                              |          |                             |       |                            |       |                            |
* +------------------------------+          +-----------------------------+       +----------------------------+       +----------------------------+
*
Return

void

Template Parameters
  • (deduced): DdrInOutSignalTensorShape: DDR input tensor shape. Expected to be of the form <1, complexCount, inputPoints, inputPoints>. complexCount = 1 if only real or 2 if complex

  • (deduced): DdrWeightsTensorShape: DDR weight tensor. Expected to be of the form <1, complexCount,log2(inputPoints),inputPoints>. ComplexCount = 1 if only real or 2 if complex

  • (deduced): qvarElemType: defaults to DdrWeightsTensorShape::elemType

Parameters
  • ddrIn: input ddr tensor

  • ddrWeights: ddr weights

  • ddrOut: output ddr containing the final result

  • ocmMemAlloc: OCM allocator used internally by the FFT to allocate scratch memory
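
A hedged call sketch for a 256x256 complex input; the weight tensor carries log2(256) = 8 stages (the pointers are hypothetical):

  typedef DdrTensor<FixedPoint32<16>, 1, 2, 256, 256> DdrSignalShape;
  typedef DdrTensor<FixedPoint32<16>, 1, 2, 8, 256>   DdrWeightsShape;

  DdrSignalShape  ddrIn(inPtr);
  DdrWeightsShape ddrWeights(weightsPtr);
  DdrSignalShape  ddrOut(outPtr);
  MemAllocator    ocmMem;

  qmath::fft2d(ddrIn, ddrWeights, ddrOut, ocmMem);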

template<bool scale = false, typename OcmHammingWeightTensor>
INLINE void calculateHammingWeight(OcmHammingWeightTensor &ocmHammingWeight)

Calculates the Hamming weights which can be used for Hamming window calculation.

Template Parameters
  • scale: Whether to scale the Hamming weights for the output.

  • OcmHammingWeightTensor: Type for the matrix ocmHammingWeight (In OCM)

Parameters
  • ocmHammingWeight: the output ocmTensor that will store the Hamming weights.
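
For reference, a host-side sketch of the standard Hamming definition w(n) = 0.54 - 0.46*cos(2*pi*n/(N-1)); that the library uses exactly these coefficients (and how scale alters them) is an assumption here:

  #include <cmath>
  #include <cstdint>
  #include <vector>

  std::vector<float> hammingWeightsRef(std::int32_t n) {
    std::vector<float> w(n);
    for(std::int32_t i = 0; i < n; i++) {
      w[i] = 0.54f - 0.46f * std::cos(2.0f * 3.14159265f * i / (n - 1));
    }
    return w;
  }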

template<typename OcmFrameTensor, typename OcmHammingWeightTensor>
INLINE void hamming(OcmFrameTensor &ocmFrameData, OcmHammingWeightTensor &ocmHammingWeight)

Applies a Hamming window over a multi-channel signal using the pre-computed Hamming weights.

Template Parameters
  • OcmFrameTensor: Type for the matrix ocmFrameData (In OCM)

  • OcmHammingWeightTensor: Type for the matrix ocmHammingWeight (In OCM)

Parameters
  • ocmFrameData: the input frame from all channels.

  • ocmHammingWeight: the input ocmTensor holding the pre-computed Hamming weights.

template<typename OcmFrameTensor>
INLINE void hamming(OcmFrameTensor &ocmFrameData)

Applies a Hamming window over a multi-channel signal.

Template Parameters
  • OcmFrameTensor: Type for the matrix ocmFrameData (In OCM)

Parameters
  • ocmFrameData: the input frame from all channels.

template<typename OcmHammingWeightTensor, typename dataType, FracRepType numFracBits, std::int32_t dataTiles>
INLINE void hamming(qVar_t<FixedPoint<dataType, numFracBits>> (&qData)[dataTiles], OcmHammingWeightTensor &ocmHammingWeight)

Applies a Hamming window over a single-channel signal using pre-computed Hamming weights.

Template Parameters
  • OcmHammingWeightTensor: Type for the matrix ocmHammingWeight (In OCM)

Parameters
  • qData: 1D qVar variable of length dataTiles where dataTiles = ceil(frameSize / Epu::numArrayCores).

  • ocmHammingWeight: the input ocmTensor holding the pre-computed Hamming weights.

template<typename dataType, FracRepType numFracBits, std::int32_t dataTiles>
INLINE void hamming(qVar_t<FixedPoint<dataType, numFracBits>> (&qData)[dataTiles], std::int32_t frameSize)

Applies a Hamming window over a single-channel signal spread across all cores.

Parameters
  • qData: 1D qVar variable of length dataTiles where dataTiles = ceil(frameSize / Epu::numArrayCores).

  • frameSize: std::int32_t variable specifying number of samples in a frame

template<typename OcmFrameShape, typename qTdoaType, typename qOutType, std::int32_t numChannels, std::int32_t dataTiles>
INLINE void delayAndSumFrame(OcmFrameShape &ocmFrame, qVar_t<qTdoaType> (&qTdoa)[numChannels], qVar_t<qOutType> (&qOut)[dataTiles])

Calculates delay-and-sum for a single frame of audio.

Parameters
  • ocmFrame: Ocm tensor with dimensions 1 x 1 x numChannels x frameSize. This stores audio data across channels (microphones) over time.

  • qTdoa: 1D qVar variable of length numChannels storing index delays to be applied to each channel. Each core has the same value for a given index. note: index_delay = sample_rate * time_delay.

  • qOut: 1D qVar variable of length dataTiles to store the output signal, where dataTiles = ceil(frameSize / Epu::numArrayCores).
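
A host-side reference of the delay-and-sum computation (delayAndSumRef is a hypothetical helper using plain floats instead of qVars):

  // out[t] = sum over channels ch of frame[ch][t - tdoa[ch]]
  void delayAndSumRef(const float* frame, const std::int32_t* tdoa,
                      std::int32_t numChannels, std::int32_t frameSize, float* out) {
    for(std::int32_t t = 0; t < frameSize; t++) {
      float acc = 0.0f;
      for(std::int32_t ch = 0; ch < numChannels; ch++) {
        std::int32_t idx = t - tdoa[ch];
        if(idx >= 0 && idx < frameSize) {  // delayed sample must fall inside the frame
          acc += frame[ch * frameSize + idx];
        }
      }
      out[t] = acc;
    }
  }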

template<typename ocmAType, typename ocmBType, typename ocmXType>
void solve(ocmAType &ocmA, ocmBType &ocmB, ocmXType &ocmX)

Solves multiple independent matrix equations of the form Ax=b, where A is a small, non-singular square matrix (matrixSize <= 20).

*  +---------------+
*  |  ocmA Tensor  |
*  +---------------+
*                      A matrices for each problem are stacked in x direction
*                      ----------->
*                   |  +------------------------------+-------------------------------+
*                   |  |                              |                               |
*                   |  |                              |                               |
*   matrix rows in  |  |<---------------------------->|<----------------------------->|
*     y direction   |  |       # problems in Roi      |       # problems in Roi       |
*                   |  |     (numCores x numCores)    |     (numCores x numCores)     |
*                   v  |                              |                               |
*                      +------------------------------+-------------------------------+
*
*                    matrix columns are in z direction (into page) with alternating
*                    real (even indices) and imaginary (odd indices) components.
*                    Thus, the z dimension is of length 2 * matrixSize.
*
*
*  +---------------+
*  |  ocmB / ocmX  |
*  +---------------+
*                      b / x vectors for each problem are stacked in x direction
*                      ----------->
*                   |  +------------------------------+-------------------------------+
*                   |  |                              |                               |
*                   |  |                              |                               |
*  vector elements  |  |<---------------------------->|<----------------------------->|
*   in y direction  |  |       # problems in Roi      |       # problems in Roi       |
*                   |  |     (numCores x numCores)    |     (numCores x numCores)     |
*                   v  |                              |                               |
*                      +------------------------------+-------------------------------+
*
*                    real (index = 0) and imaginary (index = 1) components in z direction (into page).
*
*
*
*
*  For example, suppose we have 100 Ax=b problems to solve, where each A is a 4x4 complex matrix. We need
*  to store all 100 matrices in one tensor. First, consider the first matrix A1. Suppose A1 looks
*  like this:
*
*       ┌──────┬──────┬──────┬──────┐
*       │ 1+17i│ 2+18i│ 3+19i│ 4+20i│
*       ├──────┼──────┼──────┼──────┤
*       │ 5+21i│ 6+22i│ 7+23i│ 8+24i│
*  A1 = ├──────┼──────┼──────┼──────┤
*       │ 9+25i│10+26i│11+27i│12+28i│
*       ├──────┼──────┼──────┼──────┤
*       │13+29i│14+30i│15+31i│16+32i│
*       └──────┴──────┴──────┴──────┘
*
*  We first have to handle the real and complex parts. We transform A1 into a 4 x 8 matrix that has double
*  the number of columns. The real and imaginary parts of each original column are now their own individual
*  columns like so:
*
*      ┌────►z
*      │  Re Im Re Im Re Im Re Im
*      │ ┌──┬──┬──┬──┬──┬──┬──┬──┐
*      ▼ │ 1│17│ 2│18│ 3│19│ 4│20│
*      y ├──┼──┼──┼──┼──┼──┼──┼──┤
*        │ 5│21│ 6│22│ 7│23│ 8│24│
*  A1 =  ├──┼──┼──┼──┼──┼──┼──┼──┤
*        │ 9│25│10│26│11│27│12│28│
*        ├──┼──┼──┼──┼──┼──┼──┼──┤
*        │13│29│14│30│15│31│16│32│
*        └──┴──┴──┴──┴──┴──┴──┴──┘
*
*  The rows and columns of A1 will span the y and z directions of the final tensor that we will construct.
*
*  Now, we do this transformation to each matrix A1 to A100 and then stack them in the x direction. Thus,
*  the final tensor will have the dimensions (z, y, x) = (8, 4, 100).
*
*  Note: This implementation assumes the input matrices are non-singular; it neither checks this assumption
*  nor notifies the user. A notice or flag will be added in the future.
*
*

Parameters
  • ocmA: A matrices in Ax=b. Input tensor with dimensions 1 x (2[=real+imaginary]*matrixSize) x matrixSize x numProblems

  • ocmB: b vectors in Ax=b. Input tensor with dimensions 1 x 2[=real+imaginary] x matrixSize x numProblems

  • ocmX: x vectors in Ax=b. Output tensor with dimensions 1 x 2[=real+imaginary] x matrixSize x numProblems
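
A host-side sketch of packing one complex matrix (the A1 example above) into the interleaved real/imaginary layout; the names and plain float arrays are illustrative, not the API:

  const std::int32_t matrixSize = 4;
  float a1Re[matrixSize][matrixSize];        // real parts of A1 (filled from the problem data, omitted)
  float a1Im[matrixSize][matrixSize];        // imaginary parts of A1
  float packed[matrixSize][2 * matrixSize];  // y spans rows, z interleaves Re/Im columns

  for(std::int32_t r = 0; r < matrixSize; r++) {
    for(std::int32_t c = 0; c < matrixSize; c++) {
      packed[r][2 * c]     = a1Re[r][c];  // even z index holds the real part
      packed[r][2 * c + 1] = a1Im[r][c];  // odd z index holds the imaginary part
    }
  }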

Examples

sin

  constexpr FracRepType            NUM_F_BITS = 22;
  qVar_t<FixedPoint32<NUM_F_BITS>> qPi        = 3.141592;

  qVar_t<FixedPoint32<NUM_F_BITS>> qAngle = 50 * qPi / 180;
  qVar_t<FixedPoint32<NUM_F_BITS>> qResult;

  qResult = qmath::sin(qAngle);  // Angle in radians [-pi/2, pi/2] unless transformRange is set

cos

  qResult = qmath::cos(qAngle);  // Angle in radians [-pi/2, pi/2] unless transformRange is set

log

  qVar_t<FixedPoint32<NUM_F_BITS>> qData = 10.8;
  qResult                                = qmath::log(qData);  // natural log (base e)

GEMM

General Matrix Multiply, better known as GEMM, is an optimized API call to perform matrix multiplication.

template<typename AlphaType, typename OCMShapeA, typename OCMShapeB, typename OCMShapeC, typename BetaType>
void qmath::gemm(AlphaType alpha, OCMShapeA &ocmA, OCMShapeB &ocmB, BetaType beta, OCMShapeC &ocmC, OCMShapeC &ocmOut)

Calculates the general matrix multiplication (GEMM) out = alpha*A*B + beta*C. See https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3.

Template Parameters
  • AlphaType: Type for the scalar alpha

  • OCMShapeA: Type for the matrix A (In OCM)

  • OCMShapeB: Type for the matrix B (In OCM)

  • OCMShapeC: Type for the matrix C (In OCM)

  • BetaType: Type for the scalar beta

Parameters
  • alpha: scalar alpha

  • ocmA: matrix A

  • ocmB: matrix B

  • beta: scalar beta

  • ocmC: matrix C

  • ocmOut: output matrix

template<typename AlphaType, typename OCMShapeA, typename OCMShapeB, typename BetaType, typename OCMShapeC, isOcmTensor<OCMShapeA> = 0, isOcmTensor<OCMShapeB> = 0, isOcmTensor<OCMShapeC> = 0>
void qmath::gemmMatrixMulMatrixPacked(AlphaType alpha, OCMShapeA &matA, OCMShapeB &matB, BetaType beta, OCMShapeC &matC, OCMShapeC &matOut)

A specialized version of matrixMulMatrix for GEMM that utilizes a packed data flow; it may cause a performance regression on small matrices.

Template Parameters
  • AlphaType: Type for the scalar alpha

  • OCMShapeA: Type for the matrix A (In OCM)

  • OCMShapeB: Type for the matrix B (In OCM)

  • BetaType: Type for the scalar beta

  • OCMShapeC: Type for the matrix C (In OCM)

Parameters
  • alpha: the scalar alpha

  • matA: the first matrix, A

  • matB: the second matrix, B

  • beta: the scalar beta

  • matC: the third matrix, C

  • matOut: the output matrix

Examples

gemm()

gemm.hpp:

#include <quadric/qil.h>

const std::int32_t                numFracBits = 4;
typedef FixedPoint16<numFracBits> MatT;

const std::int32_t matrixAWidth  = 1024;
const std::int32_t matrixAHeight = 1024;
const std::int32_t matrixBWidth  = 1024;

typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> DdrAInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> OcmAInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  DdrBInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  OcmBInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> DdrCInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> OcmCInOutShape;

EPU_ENTRY void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
                        DdrAInOutShape::ptrType        aPtr,
                        DdrBInOutShape::ptrType        bPtr,
                        DdrAInOutShape::UnderlyingType betaIn,
                        DdrCInOutShape::ptrType        cPtr,
                        DdrCInOutShape::ptrType        outPtr);

gemm.cpp:

/**
 * @file gemm.cpp
 */

#include "gemm.hpp"
#include <qmath.hpp>

void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
              DdrAInOutShape::ptrType        aPtr,
              DdrBInOutShape::ptrType        bPtr,
              DdrAInOutShape::UnderlyingType betaIn,
              DdrCInOutShape::ptrType        cPtr,
              DdrCInOutShape::ptrType        outPtr) {
  MatT alpha;
  alpha.value = alphaIn;
  MatT beta;
  beta.value = betaIn;

  DdrAInOutShape a(aPtr);
  DdrBInOutShape b(bPtr);
  DdrCInOutShape c(cPtr);
  DdrCInOutShape out(outPtr);

  MemAllocator ocmMem;

  OcmAInOutShape ocmA;
  OcmBInOutShape ocmB;
  OcmCInOutShape ocmC;
  OcmCInOutShape ocmOut;
  ocmMem.allocate(ocmA);
  ocmMem.allocate(ocmB);
  ocmMem.allocate(ocmC);
  ocmMem.allocate(ocmOut);

  memCpy(a, ocmA);
  memCpy(b, ocmB);
  memCpy(c, ocmC);

  qmath::gemm(alpha, ocmA, ocmB, beta, ocmC, ocmOut);

  memCpy(ocmOut, out);
}

gemm_host.cpp:

#include <quadric/host.h>
#include "gemm.hpp"

void generateOutputTensorAddToB(DdrCInOutShape& matAPtr, DdrCInOutShape& matBPtr) {
  auto matA = DdrCInOutShape::cast(matAPtr);
  auto matB = DdrCInOutShape::cast(matBPtr);

  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matB)[0][0][heightItr][widthItr] += (*matA)[0][0][heightItr][widthItr];
    }
  }
}

void generateOutputTensorMatrixMulMatrix(DdrAInOutShape& matAPtr, DdrBInOutShape& matBPtr, DdrCInOutShape& outPtr) {
  auto matA = DdrAInOutShape::cast(matAPtr);
  auto matB = DdrBInOutShape::cast(matBPtr);
  auto out  = DdrCInOutShape::cast(outPtr);
  clearTensor(outPtr);
  for(std::int32_t i = 0; i < DdrAInOutShape::NUM_ROWS; i++) {
    for(std::int32_t j = 0; j < DdrBInOutShape::NUM_COLS; j++) {
      if(std::is_same<MatT, FixedPoint16<numFracBits>>::value) {
        // fx16, MAC supported, use the higher precision way
        float sum = 0;
        for(std::int32_t k = 0; k < DdrAInOutShape::NUM_COLS; k++) {
          sum += fixedToFloat<numFracBits>((*matA)[0][0][i][k]) * fixedToFloat<numFracBits>((*matB)[0][0][k][j]);
        }
        (*out)[0][0][i][j] = floatToFixed<numFracBits>(sum);
      } else {
        for(std::int32_t k = 0; k < DdrAInOutShape::NUM_COLS; k++) {
          (*out)[0][0][i][j] += floatToFixed<numFracBits>(fixedToFloat<numFracBits>((*matA)[0][0][i][k]) *
                                                          fixedToFloat<numFracBits>((*matB)[0][0][k][j]));
        }
      }
    }
  }
}

void generateOutputTensorScalarMulMatrix(MatT alpha, DdrCInOutShape& matAPtr) {
  auto matA       = DdrCInOutShape::cast(matAPtr);
  auto floatAlpha = fixedToFloat<numFracBits>(alpha.value);
  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matA)[0][0][heightItr][widthItr] =
        floatToFixed<numFracBits>(fixedToFloat<numFracBits>((*matA)[0][0][heightItr][widthItr]) * floatAlpha);
    }
  }
}

// out = alpha*a*b + beta*c
// modifies c
void generateOutputTensorGemm(MatT            alpha,
                              DdrAInOutShape& matAPtr,
                              DdrBInOutShape& matBPtr,
                              MatT            beta,
                              DdrCInOutShape& matCPtr,
                              DdrCInOutShape& outPtr) {
  generateOutputTensorMatrixMulMatrix(matAPtr, matBPtr, outPtr);
  generateOutputTensorScalarMulMatrix(alpha, outPtr);
  generateOutputTensorScalarMulMatrix(beta, matCPtr);
  generateOutputTensorAddToB(matCPtr, outPtr);
}
HOST_MAIN(
  // clang-format off
  std::mt19937 randGen(std::random_device{}());
  std::uniform_real_distribution<float> urd(0,1);
  MatT alpha = urd(randGen);
  MatT beta = urd(randGen);

  DdrAInOutShape matAInp;
  DdrAInOutShape::allocate(matAInp);
  DdrBInOutShape matBInp;
  DdrBInOutShape::allocate(matBInp);
  DdrCInOutShape matCInp;
  DdrCInOutShape::allocate(matCInp);
  DdrCInOutShape matC2Inp;
  DdrCInOutShape::allocate(matC2Inp);

  DdrCInOutShape ddrOut;
  DdrCInOutShape::allocate(ddrOut);
  DdrCInOutShape expectedOut;
  DdrCInOutShape::allocate(expectedOut);

  populateTensorRand(matAInp, 0<<numFracBits, 1<<numFracBits);
  populateTensorRand(matBInp, 1<<numFracBits, 2<<numFracBits);
  populateTensorRand(matCInp, 1<<numFracBits, 2<<numFracBits);
  copyTensor(matCInp,matC2Inp);

  // Generate expected output.
  generateOutputTensorGemm(alpha, matAInp, matBInp, beta, matC2Inp, expectedOut);


  TensorArg<DdrAInOutShape> inputArgA{&matAInp};
  TensorArg<DdrBInOutShape> inputArgB{&matBInp};
  TensorArg<DdrCInOutShape> inputArgC{&matCInp};

  TensorArg<DdrCInOutShape> outputArg{&ddrOut, &expectedOut, FixedPoint16<numFracBits>(0.5).value};
  packageKernel(OUTPUT_PREFIX, FUNC_ARG(gemmtest), alpha.value, inputArgA, inputArgB, beta.value, inputArgC, outputArg);

  DeviceBufferRef aIn, bIn, cIn, out;

  auto device = DeviceManager().getDevice(callKernelGlobals.simConfig);
  device.loadKernel();
  device.allocateAndCopyToDevice(matAInp, aIn);
  device.allocateAndCopyToDevice(matBInp, bIn);
  device.allocateAndCopyToDevice(matCInp, cIn);
  device.allocate<DdrCInOutShape>(out);

  device.runKernel(ENTRYPOINT(gemmtest), alpha.value, aIn, bIn, beta.value, cIn, out);


  device.copyBufferFromDevice(out, ddrOut);

  auto nativeCompareVisitor = [](const auto &t1, const auto &t2) {
    return nativeCompare<DdrCInOutShape>(t1, t2, FixedPoint16<numFracBits>(0.5).value);
  };

  runtime_assert(compareTensors(ddrOut, expectedOut, nativeCompareVisitor), "tensor mismatch");
  // clang-format on
)

gemmMatrixMulMatrixPacked

gemm_packed.hpp:

#include <quadric/qil.h>

typedef std::int8_t MatT;

const std::int32_t matrixAWidth  = 1024;
const std::int32_t matrixAHeight = 1024;
const std::int32_t matrixBWidth  = 1024;

typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> DdrAInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixAWidth> OcmAInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  DdrBInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAWidth, matrixBWidth>  OcmBInOutShape;
typedef DdrTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> DdrCInOutShape;
typedef OcmTensor<MatT, 1, 1, matrixAHeight, matrixBWidth> OcmCInOutShape;

EPU_ENTRY void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
                        DdrAInOutShape::ptrType        aPtr,
                        DdrBInOutShape::ptrType        bPtr,
                        DdrAInOutShape::UnderlyingType betaIn,
                        DdrCInOutShape::ptrType        cPtr,
                        DdrCInOutShape::ptrType        outPtr);

gemm_packed.cpp:

/**
 * @file gemm_packed.cpp
 */

#include "gemm_packed.hpp"
#include <qmath.hpp>

void gemmtest(DdrAInOutShape::UnderlyingType alphaIn,
              DdrAInOutShape::ptrType        aPtr,
              DdrBInOutShape::ptrType        bPtr,
              DdrAInOutShape::UnderlyingType betaIn,
              DdrCInOutShape::ptrType        cPtr,
              DdrCInOutShape::ptrType        outPtr) {
  MatT alpha;
  alpha = alphaIn;
  MatT beta;
  beta = betaIn;

  DdrAInOutShape a(aPtr);
  DdrBInOutShape b(bPtr);
  DdrCInOutShape c(cPtr);
  DdrCInOutShape out(outPtr);

  MemAllocator ocmMem;

  OcmAInOutShape ocmA;
  OcmBInOutShape ocmB;
  OcmCInOutShape ocmC;
  OcmCInOutShape ocmOut;
  ocmMem.allocate(ocmA);
  ocmMem.allocate(ocmB);
  ocmMem.allocate(ocmC);
  ocmMem.allocate(ocmOut);

  memCpy(a, ocmA);
  memCpy(b, ocmB);
  memCpy(c, ocmC);

  qmath::gemmMatrixMulMatrixPacked(alpha, ocmA, ocmB, beta, ocmC, ocmOut);

  memCpy(ocmOut, out);
}

gemm_packed_host.cpp:

#include <quadric/host.h>
#include "gemm_packed.hpp"

void generateOutputTensorAddToB(DdrCInOutShape& matAPtr, DdrCInOutShape& matBPtr) {
  auto matA = DdrCInOutShape::cast(matAPtr);
  auto matB = DdrCInOutShape::cast(matBPtr);

  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matB)[0][0][heightItr][widthItr] += (*matA)[0][0][heightItr][widthItr];
    }
  }
}

void generateOutputTensorMatrixMulMatrix(DdrAInOutShape& matAPtr, DdrBInOutShape& matBPtr, DdrCInOutShape& outPtr) {
  auto matA = DdrAInOutShape::cast(matAPtr);
  auto matB = DdrBInOutShape::cast(matBPtr);
  auto out  = DdrCInOutShape::cast(outPtr);
  clearTensor(outPtr);
  for(std::int32_t i = 0; i < DdrAInOutShape::NUM_ROWS; i++) {
    for(std::int32_t j = 0; j < DdrBInOutShape::NUM_COLS; j++) {
      for(std::int32_t k = 0; k < DdrAInOutShape::NUM_COLS; k++) {
        (*out)[0][0][i][j] += (*matA)[0][0][i][k] * (*matB)[0][0][k][j];
      }
    }
  }
}

void generateOutputTensorScalarMulMatrix(MatT alpha, DdrCInOutShape& matAPtr) {
  auto matA = DdrCInOutShape::cast(matAPtr);
  for(std::int32_t heightItr = 0; heightItr < DdrCInOutShape::NUM_ROWS; heightItr++) {
    for(std::int32_t widthItr = 0; widthItr < DdrCInOutShape::NUM_COLS; widthItr++) {
      (*matA)[0][0][heightItr][widthItr] = (*matA)[0][0][heightItr][widthItr] * alpha;
    }
  }
}

// out = alpha*a*b + beta*c
// modifies c
void generateOutputTensorGemm(MatT            alpha,
                              DdrAInOutShape& matAPtr,
                              DdrBInOutShape& matBPtr,
                              MatT            beta,
                              DdrCInOutShape& matCPtr,
                              DdrCInOutShape& outPtr) {
  generateOutputTensorMatrixMulMatrix(matAPtr, matBPtr, outPtr);
  generateOutputTensorScalarMulMatrix(alpha, outPtr);
  generateOutputTensorScalarMulMatrix(beta, matCPtr);
  generateOutputTensorAddToB(matCPtr, outPtr);
}
HOST_MAIN(
  // clang-format off
  std::mt19937 randGen(std::random_device{}());
  std::uniform_real_distribution<float> urd(1,3);
  MatT alpha = urd(randGen);
  MatT beta = urd(randGen);

  DdrAInOutShape matAInp;
  DdrAInOutShape::allocate(matAInp);
  DdrBInOutShape matBInp;
  DdrBInOutShape::allocate(matBInp);
  DdrCInOutShape matCInp;
  DdrCInOutShape::allocate(matCInp);
  DdrCInOutShape matC2Inp;
  DdrCInOutShape::allocate(matC2Inp);

  DdrCInOutShape ddrOut;
  DdrCInOutShape::allocate(ddrOut);
  DdrCInOutShape expectedOut;
  DdrCInOutShape::allocate(expectedOut);

  populateTensorRand(matAInp, 0, 2);
  populateTensorRand(matBInp, 0, 2);
  populateTensorRand(matCInp, 0, 2);
  copyTensor(matCInp,matC2Inp);

  // Generate expected output.
  generateOutputTensorGemm(alpha, matAInp, matBInp, beta, matC2Inp, expectedOut);

  TensorArg<DdrAInOutShape> inputArgA{&matAInp};
  TensorArg<DdrBInOutShape> inputArgB{&matBInp};
  TensorArg<DdrCInOutShape> inputArgC{&matCInp};

  TensorArg<DdrCInOutShape> outputArg{&ddrOut, &expectedOut};
  packageKernel(OUTPUT_PREFIX, FUNC_ARG(gemmtest), alpha, inputArgA, inputArgB, beta, inputArgC, outputArg);

  DeviceBufferRef aIn, bIn, cIn, out;

  auto device = DeviceManager().getDevice(callKernelGlobals.simConfig);
  device.loadKernel();
  device.allocateAndCopyToDevice(matAInp, aIn);
  device.allocateAndCopyToDevice(matBInp, bIn);
  device.allocateAndCopyToDevice(matCInp, cIn);
  device.allocate<DdrCInOutShape>(out);

  device.runKernel(ENTRYPOINT(gemmtest), alpha, aIn, bIn, beta, cIn, out);

  device.copyBufferFromDevice(out, ddrOut);

  auto nativeCompareVisitor = [](const auto &t1, const auto &t2) {
    return nativeCompare<DdrCInOutShape>(t1, t2);
  };

  runtime_assert(compareTensors(ddrOut, expectedOut, nativeCompareVisitor), "Tensors did not match");
  // clang-format on
)