OCM <-> Array Data Fetching

Fetching Data from OCM to the Array

For an explanation and further examples of the API documented below, see here.

Example

The following example outlines the flow of a tensor (1x3x8x8) into the array using the fetchAllTiles API:

  constexpr std::uint32_t WIDTH  = 8;
  constexpr std::uint32_t HEIGHT = 8;
  constexpr std::uint32_t DEPTH  = 3;

  // Defining the Tensor shape as a typedef avoids being verbose when the shape information is required later in the
  // code.
  typedef OcmTensor<std::int32_t, 1, DEPTH, HEIGHT, WIDTH> OcmTensorShape3;
  MemAllocator                                             ocmMem3;
  OcmTensorShape3                                          ocmTensor3;

  // Allocate local memory on the array. The tensor typedef provides helper enums (e.g. NUM_TILES, used below) to
  // access shape-derived values.
  qVar_t<std::int32_t> qData[OcmTensorShape3::NUM_TILES];

  // Fetch all the data from OCM into the Array
  fetchAllTiles(ocmTensor3, qData);

Writing Data to OCM from the Array

For an explanation and further examples of the API documented below, see here.

Example

Similar to the fetch APIs, the write functions are used to flow data from the array to OCM:

  writeTilesInRoi<RoiShape, IteratorType::XY_NO_BORDER>(qRoiData, ocmTensor4, 0, 0, 5, 16);

Broadcasting Data From OCM to All Cores on the Array

For an explanation and further examples of the API documented below, see here.

template<typename TensorShape, typename RoiShape, std::uint8_t numberOfPartitions = 1, size_t ArrayElemTypeSize = sizeof(typename TensorShape::elemType)>
INLINE static void stage(TensorShape &tensor, std::int32_t roiBatchOffset = 0, std::int32_t roiChOffset = 0, std::int32_t roiHeightOffset = 0, std::int32_t roiWidthOffset = 0)

Queue a tensor to be broadcast using the Broadcast bus.

Parameters
  • tensor: The tensor from OCM that is going to be queued.

  • [in] roiBatchOffset: The RoI batch offset. Defaults to 0.

  • [in] roiChOffset: The RoI channel offset. Defaults to 0.

  • [in] roiHeightOffset: The RoI height offset. Defaults to 0.

  • [in] roiWidthOffset: The RoI width offset. Defaults to 0.

Template Parameters
  • TensorShape: The shape of the input tensor.

  • RoiShape: The shape of the region of interest (RoI) to be utilized in the broadcast.

  • numberOfPartitions: The number of partitions used for Virtual Array Partitioning. Can be ignored; defaults to 1.

  • ArrayElemTypeSize: The size of each array element. Defaults to sizeof(typename TensorShape::elemType), so the user normally does not need to specify it.

Example

Below, see an example of a complete broadcast, from OCM setup to array-side consumption:

typedef DdrTensor<std::int32_t, 1, 1, Epu::coreDim, Epu::coreDim> DdrInOutShape;
typedef OcmTensor<std::int32_t, 1, 1, Epu::coreDim, Epu::coreDim> OcmInOutShape;

using RoiShape = DdrInOutShape;

typedef OcmTensor<char, RoiShape::NUM_BCH, RoiShape::NUM_CHN, RoiShape::NUM_ROWS, RoiShape::NUM_COLS> RoiShapeDesc;

//! [Tensor Defs]

EPU_ENTRY void broadcastStream(DdrInOutShape::ptrType ddrInPtr,
                               DdrInOutShape::ptrType ddrOutPtr8,
                               DdrInOutShape::ptrType ddrOutPtr16,
                               DdrInOutShape::ptrType ddrOutPtr32) {
  DdrInOutShape ddrIn(ddrInPtr);
  DdrInOutShape ddrOut8(ddrOutPtr8);
  DdrInOutShape ddrOut16(ddrOutPtr16);
  DdrInOutShape ddrOut32(ddrOutPtr32);

  MemAllocator ocmMem;
  //! [OCM var def]
  OcmInOutShape ocmInp;
  //! [OCM var def]
  ocmMem.allocate<OcmInOutShape>(ocmInp);
  OcmInOutShape ocmOutp8;
  ocmMem.allocate<OcmInOutShape>(ocmOutp8);
  OcmInOutShape ocmOutp16;
  ocmMem.allocate<OcmInOutShape>(ocmOutp16);
  OcmInOutShape ocmOutp32;
  ocmMem.allocate<OcmInOutShape>(ocmOutp32);

  memCpy<DdrInOutShape, OcmInOutShape>(ddrIn, ocmInp);

  constexpr std::int32_t numBroadcasts8  = DdrInOutShape::linearElemCount / getNumberOfWeightRegisters<std::int8_t>();
  constexpr std::int32_t numBroadcasts16 = DdrInOutShape::linearElemCount / getNumberOfWeightRegisters<std::int16_t>();
  //! [Array Var Setup]
  constexpr std::int32_t numBroadcasts32 = DdrInOutShape::linearElemCount / getNumberOfWeightRegisters<std::int32_t>();

  // Array to hold all the weights
  qVar_t<std::int32_t> arr[getNumberOfWeightRegisters<std::int8_t>()];
  // Our output data
  qVar_t<std::int32_t> out[1];
  // Initialize output to zero
  out[0] = 0;
  //! [Array Var Setup]

  BroadcastStream::stage<OcmInOutShape, RoiShapeDesc>(ocmInp);

  for(std::int32_t i = 0; i < numBroadcasts8; i++) {
    arr[0] = qBroadcast<0, std::int8_t, BroadcastAction::POP>;
    arr[1] = qBroadcast<1, std::int8_t>;
    arr[2] = qBroadcast<2, std::int8_t>;
    arr[3] = qBroadcast<3, std::int8_t>;
    arr[4] = qBroadcast<4, std::int8_t>;
    arr[5] = qBroadcast<5, std::int8_t>;
    arr[6] = qBroadcast<6, std::int8_t>;
    arr[7] = qBroadcast<7, std::int8_t>;

    out[0] += (arr[0] + arr[1] + arr[2] + arr[3] + arr[4] + arr[5] + arr[6] + arr[7]);
  }

  writeAllTiles(out, ocmOutp8);
  memCpy<OcmInOutShape, DdrInOutShape>(ocmOutp8, ddrOut8);

  // Clear Broadcast Bus.
  for(std::int32_t i = 0; i < numBroadcasts32 - numBroadcasts8; i++) {
    arr[0] = qBroadcast<0, std::int32_t, BroadcastAction::POP>;
  }

  // Initialize output to zero
  out[0] = 0;
  // Broadcast data.
  BroadcastStream::stage<OcmInOutShape, RoiShapeDesc>(ocmInp);

  for(std::int32_t i = 0; i < numBroadcasts16; i++) {
    arr[0] = qBroadcast<0, std::int16_t, BroadcastAction::POP>;
    arr[1] = qBroadcast<1, std::int16_t>;
    arr[2] = qBroadcast<2, std::int16_t>;
    arr[3] = qBroadcast<3, std::int16_t>;

    out[0] += (arr[0] + arr[1] + arr[2] + arr[3]);
  }
  writeAllTiles(out, ocmOutp16);
  memCpy<OcmInOutShape, DdrInOutShape>(ocmOutp16, ddrOut16);

  // Clear Broadcast Bus.
  for(std::int32_t i = 0; i < numBroadcasts32 - numBroadcasts16; i++) {
    arr[0] = qBroadcast<0, std::int32_t, BroadcastAction::POP>;
  }
  // Initialize output to zero
  out[0] = 0;

  //! [Broadcast Setup]
  // Broadcast data.
  BroadcastStream::stage<OcmInOutShape, RoiShapeDesc>(ocmInp);
  //! [Broadcast Setup]