OCM <-> Array Data Fetching

Fetching Data from OCM to the Array

For an explanation and further examples of the API documented below, see here.

template<std::int32_t numTopZerosRows, std::int32_t numDataRows, std::int32_t numBottomZerosRows, Epu::StreamSide streamSide = Epu::StreamSide::SOUTH, typename TensorShape, typename tensorElemType>
INLINE void fetchTileBlock(TensorShape &srcTensor, qVar_t<tensorElemType> dstArray[], std::int32_t roiBatchOffset = 0, std::int32_t roiChOffset = 0, std::int32_t roiHeightOffset = 0, std::int32_t roiWidthOffset = 0)

Fetches only the single tile block requested at the provided offset. This only applies to the IteratorType::ZY_BORDER case.

* 
*              xx──────────────xx──────────────────────xx
*             xx              xx                      xx
*           xx │            xx                      xx │
*         xxx  │          xxx                     xxx  │
*       xxx    │        xxx                     xxx    │
*      xx      │       xx                      xx      │
*    xx        │     xx                      xx        │
*  xx          │    xx                      xx         │
* ┌────────────┼───x───────────────────────x           │
* │            │   │                       │           │
* │           xx   │           xx          │           │
* │          x ┼   │          xx           │           │
* │   Tile Block   │        xx             │           │
* │      x     │   │      xxx              │           │
* │    xxx     │   │    xxx                │           │
* │   xx       │   │   xx                  │           │
* │ xx         │   │ xx                    │           │
* │xx          │   │xx                     │           │
* x────────────┼───x                       │           │
* │            │                           │           │
* │            │                           │           │
* │            │                           │           │
* │           xx───────────────────────────┼──────────xx
* │          xx                            │         xx
* │        xx                              │       xx
* │      xxx                               │     xxx
* │    xxx                                 │   xxx
* │   xx                                   │  xx
* │ xx                                     │xx
* │xx                                      xx
* x───────────────────────────────────────x
*
*
Template Parameters
  • numTopZerosRows: Number of top zero rows in a tile, e.g. on 8x4, the first tile would say 4 and subsequent tiles would say 0.

  • numDataRows: Number of data rows making up a tile, e.g. on 8x4, the first tile would say 12 (8 cores + 4 bottom border).

  • numBottomZerosRows: Number of bottom zero rows in a tile, e.g. on 8x4, the first tile would say 0 and the last tile in the height dimension would say at least 4.

  • streamSide: Direction of flow

  • TensorShape: Type defining tensor shape

  • tensorElemType: data type of the elements stored in the tensor

Parameters
  • srcTensor: source tensor

  • dstArray: destination array of qVar_t<T> elements

  • roiBatchOffset: batch offset

  • roiChOffset: channel offset

  • roiHeightOffset: height offset

  • roiWidthOffset: width offset

template<IteratorType iterator = IteratorType::ZY_BORDER, Epu::StreamSide streamSide = Epu::StreamSide::SOUTH, typename TensorShape, typename tensorElemType>
INLINE std::enable_if<iterator == IteratorType::ZY_BORDER, void>::type fetchAllTiles(TensorShape &srcTensor, qVar_t<tensorElemType> dstArray[])

Fetches all tiles using the ZY Iteration pattern.

Parameters
  • srcTensor: The source tensor

  • dstArray: The destination array of qVar_t<T> elements

Template Parameters
  • iterator: iterator pattern of flow

  • streamSide: stream side of flow

  • TensorShape: shape of the tensor

  • tensorElemType: type of the elements in the tensor

Example

The following example outlines the flow of a tensor (1x3x8x8) into the array using the fetchAllTiles API:

  // Tensor dimensions used by this example: 1 x DEPTH x HEIGHT x WIDTH.
  constexpr std::uint32_t WIDTH  = 8;
  constexpr std::uint32_t HEIGHT = 8;
  constexpr std::uint32_t DEPTH  = 3;

  // Defining the Tensor shape as a typedef avoids being verbose when the shape information is required later in the
  // code.
  typedef OcmTensor<std::int32_t, 1, DEPTH, HEIGHT, WIDTH> OcmTensorShape3;
  // NOTE(review): ocmMem3 is declared but never used in this excerpt; the broadcast example on this page calls
  // ocmMem.allocate<...>(tensor) before using an OCM tensor - confirm whether an allocate call was omitted here.
  MemAllocator                                             ocmMem3;
  OcmTensorShape3                                          ocmTensor3;

  // Allocate local memory on the array. The tensor typedef provides helper enums such as NUM_TILES, which is used
  // here to size the destination array.
  qVar_t<std::int32_t> qData[OcmTensorShape3::NUM_TILES];

  // Fetch all the data from OCM into the Array
  fetchAllTiles(ocmTensor3, qData);

Writing Data to OCM from the Array

For an explanation and further examples of the API documented below, see here.

template<IteratorType iterator = IteratorType::ZY_NO_BORDER, Epu::StreamSide streamSide = Epu::StreamSide::NORTH, std::int32_t coreHeightOffset = 0, std::int32_t coreWidthOffset = 0, typename TensorShape, typename tensorElemType>
INLINE void writeAllTiles(qVar_t<tensorElemType> srcArray[], TensorShape &dstTensor)

Writes all tiles to OCM from an array of qVar_t<T> elements.

Parameters
  • srcArray: The source array of qVar_t<T> elements

  • dstTensor: The tensor

Template Parameters
  • iterator: iterator pattern of flow

  • streamSide: stream side of flow

  • coreHeightOffset: height offset of the tile data

  • coreWidthOffset: width offset of the tile data

  • TensorShape: shape of the tensor

  • tensorElemType: type of the elements in the tensor

Example

Similar to the fetch APIs, the write functions are used to flow data from the array to OCM. The example below uses writeTilesInRoi:

  // Write the tiles held in qRoiData to ocmTensor4 using the XY_NO_BORDER iterator.
  // NOTE(review): the trailing arguments (0, 0, 5, 16) are presumably the ROI batch/channel/height/width offsets -
  // confirm against the writeTilesInRoi signature.
  writeTilesInRoi<RoiShape, IteratorType::XY_NO_BORDER>(qRoiData, ocmTensor4, 0, 0, 5, 16);

Broadcasting Data From OCM to All Cores on the Array

For an explanation and further examples of the API documented below, see here.

template<typename TensorShape, typename RoiShape, std::uint8_t numberOfPartitions = 1, size_t ArrayElemTypeSize = sizeof(typename TensorShape::elemType)>
INLINE static void stage(TensorShape &tensor, std::int32_t roiBatchOffset = 0, std::int32_t roiChOffset = 0, std::int32_t roiHeightOffset = 0, std::int32_t roiWidthOffset = 0)

Queues a tensor to be broadcast using the broadcast bus.

Parameters
  • tensor: The tensor from OCM that is going to be queued.

  • [in] roiBatchOffset: The roi batch offset

  • [in] roiChOffset: The roi ch offset

  • [in] roiHeightOffset: The roi height offset

  • [in] roiWidthOffset: The roi width offset

Template Parameters
  • TensorShape: The shape of the input tensor.

  • RoiShape: The shape of the region of interest (RoI) to be utilized in the broadcast

  • numberOfPartitions: The number of partitions used for Virtual Array Partitioning. Defaults to 1 and can normally be ignored.

  • ArrayElemTypeSize: The size of each array element. Defaults to sizeof(typename TensorShape::elemType), so the user does not normally need to specify it.

Example

Below, see an example of a complete broadcast, from OCM setup to array-side consumption:

// DDR and OCM tensor types used by the example; both use std::int32_t elements and the same dimensions
// (1, 1, Epu::coreDim, Epu::coreDim).
typedef DdrTensor<std::int32_t, 1, 1, Epu::coreDim, Epu::coreDim> DdrInOutShape;
typedef OcmTensor<std::int32_t, 1, 1, Epu::coreDim, Epu::coreDim> OcmInOutShape;

// The region of interest has the same shape as the full input/output tensor.
using RoiShape = DdrInOutShape;

// ROI descriptor passed to BroadcastStream::stage; it reuses the RoiShape dimensions.
// NOTE(review): the element type here is char rather than std::int32_t - presumably intentional; confirm.
typedef OcmTensor<char, RoiShape::NUM_BCH, RoiShape::NUM_CHN, RoiShape::NUM_ROWS, RoiShape::NUM_COLS> RoiShapeDesc;

//! [Tensor Defs]

EPU_ENTRY void broadcastStream(DdrInOutShape::ptrType ddrInPtr,
                               DdrInOutShape::ptrType ddrOutPtr8,
                               DdrInOutShape::ptrType ddrOutPtr16,
                               DdrInOutShape::ptrType ddrOutPtr32) {
  DdrInOutShape ddrIn(ddrInPtr);
  DdrInOutShape ddrOut8(ddrOutPtr8);
  DdrInOutShape ddrOut16(ddrOutPtr16);
  DdrInOutShape ddrOut32(ddrOutPtr32);

  MemAllocator ocmMem;
  //! [OCM var def]
  OcmInOutShape ocmInp;
  //! [OCM var def]
  ocmMem.allocate<OcmInOutShape>(ocmInp);
  OcmInOutShape ocmOutp8;
  ocmMem.allocate<OcmInOutShape>(ocmOutp8);
  OcmInOutShape ocmOutp16;
  ocmMem.allocate<OcmInOutShape>(ocmOutp16);
  OcmInOutShape ocmOutp32;
  ocmMem.allocate<OcmInOutShape>(ocmOutp32);

  memCpy<DdrInOutShape, OcmInOutShape>(ddrIn, ocmInp);

  constexpr std::int32_t numBroadcasts8  = DdrInOutShape::linearElemCount / getNumberOfWeightRegisters<std::int8_t>();
  constexpr std::int32_t numBroadcasts16 = DdrInOutShape::linearElemCount / getNumberOfWeightRegisters<std::int16_t>();
  //! [Array Var Setup]
  constexpr std::int32_t numBroadcasts32 = DdrInOutShape::linearElemCount / getNumberOfWeightRegisters<std::int32_t>();

  // Array to hold all the weights
  qVar_t<std::int32_t> arr[getNumberOfWeightRegisters<std::int8_t>()];
  // Our output data
  qVar_t<std::int32_t> out[1];
  // Initialize output to zero
  out[0] = 0;
  //! [Array Var Setup]

  BroadcastStream::stage<OcmInOutShape, RoiShapeDesc>(ocmInp);

  for(std::int32_t i = 0; i < numBroadcasts8; i++) {
    arr[0] = qBroadcast<0, std::int8_t, BroadcastAction::POP>;
    arr[1] = qBroadcast<1, std::int8_t>;
    arr[2] = qBroadcast<2, std::int8_t>;
    arr[3] = qBroadcast<3, std::int8_t>;
    arr[4] = qBroadcast<4, std::int8_t>;
    arr[5] = qBroadcast<5, std::int8_t>;
    arr[6] = qBroadcast<6, std::int8_t>;
    arr[7] = qBroadcast<7, std::int8_t>;

    out[0] += (arr[0] + arr[1] + arr[2] + arr[3] + arr[4] + arr[5] + arr[6] + arr[7]);
  }

  writeAllTiles(out, ocmOutp8);
  memCpy<OcmInOutShape, DdrInOutShape>(ocmOutp8, ddrOut8);

  // Clear Broadcast Bus.
  for(std::int32_t i = 0; i < numBroadcasts32 - numBroadcasts8; i++) {
    arr[0] = qBroadcast<0, std::int32_t, BroadcastAction::POP>;
  }

  // Initialize output to zero
  out[0] = 0;
  // Broadcast data.
  BroadcastStream::stage<OcmInOutShape, RoiShapeDesc>(ocmInp);

  for(std::int32_t i = 0; i < numBroadcasts16; i++) {
    arr[0] = qBroadcast<0, std::int16_t, BroadcastAction::POP>;
    arr[1] = qBroadcast<1, std::int16_t>;
    arr[2] = qBroadcast<2, std::int16_t>;
    arr[3] = qBroadcast<3, std::int16_t>;

    out[0] += (arr[0] + arr[1] + arr[2] + arr[3]);
  }
  writeAllTiles(out, ocmOutp16);
  memCpy<OcmInOutShape, DdrInOutShape>(ocmOutp16, ddrOut16);

  // Clear Broadcast Bus.
  for(std::int32_t i = 0; i < numBroadcasts32 - numBroadcasts16; i++) {
    arr[0] = qBroadcast<0, std::int32_t, BroadcastAction::POP>;
  }
  // Initialize output to zero
  out[0] = 0;

  //! [Broadcast Setup]
  // Broadcast data.
  BroadcastStream::stage<OcmInOutShape, RoiShapeDesc>(ocmInp);
  //! [Broadcast Setup]