After the model conversion and compilation steps described in the previous sections, we provide the unified heterogeneous computing platform UCP on the development board to help you deploy the model quickly, together with related samples.
Deployment on the board can be roughly divided into three steps: building the on-board sample, preparing for on-board execution, and finally running on the board.
To build the on-board sample, the following preparation is required:
Prepare the dependency libraries required to run the on-board sample. The OE package already provides them under samples/ucp_tutorial/deps_aarch64; we recommend using the libraries in this directory to complete this step quickly.
Prepare the source file of the on-board sample (main.cc).
In main.cc, the main function and the input tensors need to be adjusted according to the model's inputs.
For models with Pyramid input, we provide a sample in the OE package under samples/ucp_tutorial/dnn/basic_samples/code/00_quick_start. Here we walk through the main function in main.cc and the preparation of the input and output tensors:
/**
* Step1: get model handle
* Step2: prepare input and output tensor
* Step3: set input data to input tensor
* Step4: run inference
* Step5: do postprocess with output data
* Step6: release resources
*/
#include <fstream>
#include <iostream>
#include <vector>
#include <cstring>
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
#define ALIGN(value, alignment) (((value) + ((alignment)-1)) & ~((alignment)-1))
#define ALIGN_32(value) ALIGN(value, 32)
const char* hbm_path = "model.hbm";
std::string data_y_path = "ILSVRC2012_val_00000001_y.bin";
std::string data_uv_path = "ILSVRC2012_val_00000001_uv.bin";
// Read binary input file
int read_binary_file(std::string file_path, char **bin, int *length) {
std::ifstream ifs(file_path, std::ios::in | std::ios::binary);
ifs.seekg(0, std::ios::end);
*length = ifs.tellg();
ifs.seekg(0, std::ios::beg);
*bin = new char[sizeof(char) * (*length)];
ifs.read(*bin, *length);
ifs.close();
return 0;
}
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
hbDNNHandle_t dnn_handle);
int main() {
// Get model handle
hbDNNPackedHandle_t packed_dnn_handle;
hbDNNHandle_t dnn_handle;
hbDNNInitializeFromFiles(&packed_dnn_handle, &hbm_path, 1);
const char **model_name_list;
int model_count = 0;
hbDNNGetModelNameList(&model_name_list, &model_count, packed_dnn_handle);
hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle, model_name_list[0]);
// Prepare input and output tensor
std::vector<hbDNNTensor> input_tensors;
std::vector<hbDNNTensor> output_tensors;
int input_count = 0;
int output_count = 0;
hbDNNGetInputCount(&input_count, dnn_handle);
hbDNNGetOutputCount(&output_count, dnn_handle);
input_tensors.resize(input_count);
output_tensors.resize(output_count);
// Initialize and malloc the tensor
prepare_tensor(input_tensors.data(), output_tensors.data(), dnn_handle);
// Copy binary input data to input tensor
int32_t data_length = 0;
char *y_data = nullptr;
read_binary_file(data_y_path, &y_data, &data_length);
// Copy y_data to input tensor
memcpy(reinterpret_cast<char *>(input_tensors[0].sysMem.virAddr),
y_data, input_tensors[0].sysMem.memSize);
  delete[] y_data;
// Refresh the cached system memory
hbUCPMemFlush(&(input_tensors[0].sysMem), HB_SYS_MEM_CACHE_CLEAN);
char *uv_data = nullptr;
read_binary_file(data_uv_path, &uv_data, &data_length);
// Copy uv_data to input tensor
memcpy(reinterpret_cast<char *>(input_tensors[1].sysMem.virAddr),
uv_data, input_tensors[1].sysMem.memSize);
  delete[] uv_data;
// Refresh the cached system memory
hbUCPMemFlush(&(input_tensors[1].sysMem), HB_SYS_MEM_CACHE_CLEAN);
// Submit task and wait till it completed
hbUCPTaskHandle_t task_handle{nullptr};
hbDNNTensor *output = output_tensors.data();
// Generate task handle
hbDNNInferV2(&task_handle, output, input_tensors.data(), dnn_handle);
// Submit task
hbUCPSchedParam ctrl_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&ctrl_param);
ctrl_param.backend = HB_UCP_BPU_CORE_ANY;
hbUCPSubmitTask(task_handle, &ctrl_param);
// Wait task completed
hbUCPWaitTaskDone(task_handle, 0);
// Parse inference result and calculate TOP1
hbUCPMemFlush(&output_tensors[0].sysMem, HB_SYS_MEM_CACHE_INVALIDATE);
auto result = reinterpret_cast<float *>(output_tensors[0].sysMem.virAddr);
float max_score = 0.0;
int label = -1;
// Find the max score and corresponding label
for (auto i = 0; i < 1000; i++) {
float score = result[i];
if (score > max_score) {
label = i;
max_score = score;
}
}
// Output the result
std::cout << "label: " << label << std::endl;
hbUCPReleaseTask(task_handle);
// Free input memory
for (int i = 0; i < input_count; i++) {
hbUCPFree(&(input_tensors[i].sysMem));
}
// Free output memory
for (int i = 0; i < output_count; i++) {
hbUCPFree(&(output_tensors[i].sysMem));
}
// Release model
hbDNNRelease(packed_dnn_handle);
}
// Prepare input tensor and output tensor
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
hbDNNHandle_t dnn_handle) {
// Get input and output tensor counts
int input_count = 0;
int output_count = 0;
hbDNNGetInputCount(&input_count, dnn_handle);
hbDNNGetOutputCount(&output_count, dnn_handle);
hbDNNTensor *input = input_tensor;
// Get the properties of the input tensor
for (int i = 0; i < input_count; i++) {
hbDNNGetInputTensorProperties(&input[i].properties, dnn_handle, i);
// Calculate the stride of the input tensor
auto dim_len = input[i].properties.validShape.numDimensions;
for (int32_t dim_i = dim_len - 1; dim_i >= 0; --dim_i) {
if (input[i].properties.stride[dim_i] == -1) {
auto cur_stride =
input[i].properties.stride[dim_i + 1] *
input[i].properties.validShape.dimensionSize[dim_i + 1];
input[i].properties.stride[dim_i] = ALIGN_32(cur_stride);
}
}
// Calculate the memory size of the input tensor and allocate cache memory
int input_memSize = input[i].properties.stride[0] *
input[i].properties.validShape.dimensionSize[0];
hbUCPMallocCached(&input[i].sysMem, input_memSize, 0);
}
hbDNNTensor *output = output_tensor;
// Get the properties of the output tensor
for (int i = 0; i < output_count; i++) {
hbDNNGetOutputTensorProperties(&output[i].properties, dnn_handle, i);
// Calculate the memory size of the output tensor and allocate cache memory
int output_memSize = output[i].properties.alignedByteSize;
hbUCPMallocCached(&output[i].sysMem, output_memSize, 0);
// Show how to get output name
const char *output_name;
hbDNNGetOutputName(&output_name, dnn_handle, i);
}
return 0;
}
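For brevity, the sample above ignores the return values of the hbDNN/hbUCP calls. In a real deployment it is worth checking them; below is a minimal sketch, assuming the convention that these interfaces return 0 on success (the CHECK_UCP macro itself is our own illustration, not part of the UCP API):
// Minimal error-checking sketch (illustrative, assumes 0 means success).
#define CHECK_UCP(expr)                                             \
  do {                                                              \
    auto _ret = (expr);                                             \
    if (_ret != 0) {                                                \
      std::cout << #expr << " failed, ret = " << _ret << std::endl; \
      return _ret;                                                  \
    }                                                               \
  } while (0)
// Example usage inside main():
// CHECK_UCP(hbDNNInitializeFromFiles(&packed_dnn_handle, &hbm_path, 1));
// CHECK_UCP(hbUCPSubmitTask(task_handle, &ctrl_param));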
For models with Resizer input, we provide a sample in the OE package under samples/ucp_tutorial/dnn/basic_samples/code/02_advanced_samples/roi_infer. Here we walk through the main function in main.cc and the preparation of the input and output tensors:
/**
* Step1: get model handle
* Step2: set input data to nv12
* Step3: prepare roi mem
* Step4: prepare input and output tensor
* Step5: run inference
* Step6: do postprocess with output data for every task
* Step7: release resources
*/
#include <fstream>
#include <cstring>
#include <iostream>
#include <map>
#include <vector>
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
const char *model_file = "model.hbm";
std::string data_y_path = "ILSVRC2012_val_00000001_y.bin";
std::string data_uv_path = "ILSVRC2012_val_00000001_uv.bin";
typedef struct Roi {
int32_t left;
int32_t top;
int32_t right;
int32_t bottom;
} Roi;
int read_image_2_nv12(std::string &y_path, std::string &uv_path,
std::vector<hbUCPSysMem> &image_mem, int &input_h,
int &input_w);
int prepare_roi_mem(const std::vector<Roi> &rois,
std::vector<hbUCPSysMem> &roi_mem);
int prepare_image_tensor(const std::vector<hbUCPSysMem> &image_mem, int input_h,
int input_w, hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &input_tensor);
int read_binary_file(std::string file_path, char **bin, int *length);
/**
* prepare roi tensor
* @param[in] roi_mem: roi mem info
* @param[in] dnn_handle: dnn handle
* @param[in] roi_tensor_id: tensor id of roi input in model
* @param[out] roi_tensor: roi tensor
*/
int prepare_roi_tensor(const hbUCPSysMem *roi_mem, hbDNNHandle_t dnn_handle,
int32_t roi_tensor_id, hbDNNTensor *roi_tensor);
/**
* prepare out tensor
* @param[in] dnn_handle: dnn handle
* @param[out] output: output tensor
*/
int prepare_output_tensor(hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &output);
int main(int argc, char **argv) {
// load model
hbDNNPackedHandle_t packed_dnn_handle;
hbDNNHandle_t dnn_handle;
const char **model_name_list;
int model_count = 0;
// Step1: get model handle
hbDNNInitializeFromFiles(&packed_dnn_handle, &model_file, 1);
hbDNNGetModelNameList(&model_name_list, &model_count, packed_dnn_handle);
hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle, model_name_list[0]);
// Step2: set input data to nv12
  // In the sample, since every task uses the same image, one memory block
  // can be allocated and reused. image_mems holds the y and uv image data.
std::vector<hbUCPSysMem> image_mems(2);
// image input size
int input_h = 224;
int input_w = 224;
read_image_2_nv12(data_y_path, data_uv_path, image_mems, input_h, input_w);
// Step3: prepare roi mem
  /**
   * Suppose we want to run inference on 2 roi tasks, so the number of ROIs
   * to prepare is also 2.
   */
  // left = 0, top = 0, right = 223, bottom = 223
Roi roi_1 = {0, 0, 223, 223};
// left = 1, top = 1, right = 223, bottom = 223
Roi roi_2 = {1, 1, 223, 223};
std::vector<Roi> rois;
rois.push_back(roi_1);
rois.push_back(roi_2);
int roi_num = 2;
std::vector<hbUCPSysMem> roi_mems(2);
prepare_roi_mem(rois, roi_mems);
// Step4: prepare input and output tensor
std::vector<std::vector<hbDNNTensor>> input_tensors(roi_num);
std::vector<std::vector<hbDNNTensor>> output_tensors(roi_num);
for (int i = 0; i < roi_num; ++i) {
// prepare input tensor
int input_count = 0;
hbDNNGetInputCount(&input_count, dnn_handle);
input_tensors[i].resize(input_count);
// prepare image tensor
    /** Tips:
     * In this sample all tasks use the same image, so memory is allocated
     * once and every input tensor reuses it. If your model takes different
     * input images, please allocate separate memory for each input.
     * */
prepare_image_tensor(image_mems, input_h, input_w,
dnn_handle, input_tensors[i]);
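    // Inputs 0 and 1 are the y/uv image tensors prepared above; in this
    // sample the roi tensor is the model's third input.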
auto roi_tensor_id = 2;
prepare_roi_tensor(&roi_mems[i], dnn_handle, roi_tensor_id,
&input_tensors[i][roi_tensor_id]);
// prepare output tensor
int output_count = 0;
hbDNNGetOutputCount(&output_count, dnn_handle);
output_tensors[i].resize(output_count);
prepare_output_tensor(dnn_handle, output_tensors[i]);
}
// Step5: run inference
hbUCPTaskHandle_t task_handle{nullptr};
  /** Tips:
   * In this sample, multiple inference tasks are attached to one task handle
   * and submitted together:
   * when task_handle is nullptr, hbDNNInferV2 creates a new task;
   * when task_handle already exists but has not been submitted yet, the new
   * task is attached to it, forming a multi-model task.
   * */
for (int i = 0; i < roi_num; ++i) {
hbDNNInferV2(&task_handle, output_tensors[i].data(),
input_tensors[i].data(), dnn_handle);
}
// submit multi tasks
hbUCPSchedParam infer_ctrl_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&infer_ctrl_param);
hbUCPSubmitTask(task_handle, &infer_ctrl_param);
// wait task done
hbUCPWaitTaskDone(task_handle, 0);
// Step6: do postprocess with output data for every task
// Find the max score and corresponding label
for (auto roi_idx = 0; roi_idx < roi_num; roi_idx++) {
auto result = reinterpret_cast<float *>(output_tensors[roi_idx][0].sysMem.virAddr);
float max_score = 0.0;
int label = -1;
for (auto i = 0; i < 1000; i++) {
float score = result[i];
if (score > max_score) {
label = i;
max_score = score;
}
}
std::cout << "label: " << label << std::endl;
}
// Step7: release resources
// release task handle
hbUCPReleaseTask(task_handle);
// free input mem
for (auto &mem : image_mems) {
hbUCPFree(&mem);
}
for (auto &mem : roi_mems) {
hbUCPFree(&mem);
}
// free output mem
for (auto &tensors : output_tensors) {
for (auto &tensor : tensors) {
hbUCPFree(&(tensor.sysMem));
}
}
// release model
hbDNNRelease(packed_dnn_handle);
return 0;
}
#define ALIGN(value, alignment) (((value) + ((alignment)-1)) & ~((alignment)-1))
#define ALIGN_32(value) ALIGN(value, 32)
int prepare_image_tensor(const std::vector<hbUCPSysMem> &image_mem, int input_h,
int input_w, hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &input_tensor) {
// y and uv tensor
for (int i = 0; i < 2; i++) {
hbDNNGetInputTensorProperties(&input_tensor[i].properties, dnn_handle, i);
input_tensor[i].sysMem = image_mem[i];
    /** Tips:
     * For a roi model, the input valid shape should be modified to the real
     * input image shape. Here the layout of the y/uv shape is NHWC.
     * */
input_tensor[i].properties.validShape.dimensionSize[1] = input_h;
input_tensor[i].properties.validShape.dimensionSize[2] = input_w;
if (i == 1) {
// uv input
input_tensor[i].properties.validShape.dimensionSize[1] /= 2;
input_tensor[i].properties.validShape.dimensionSize[2] /= 2;
}
    /** Tips:
     * For the input tensor, the stride should be set according to the real
     * padding of your data; 32-byte alignment is required for y/uv.
     **/
input_tensor[i].properties.stride[1] =
ALIGN_32(input_tensor[i].properties.stride[2] *
input_tensor[i].properties.validShape.dimensionSize[2]);
input_tensor[i].properties.stride[0] =
input_tensor[i].properties.stride[1] *
input_tensor[i].properties.validShape.dimensionSize[1];
}
return 0;
}
int prepare_roi_tensor(const hbUCPSysMem *roi_mem, hbDNNHandle_t dnn_handle,
int32_t roi_tensor_id, hbDNNTensor *roi_tensor) {
hbDNNGetInputTensorProperties(&roi_tensor->properties, dnn_handle, roi_tensor_id);
roi_tensor->sysMem = *roi_mem;
return 0;
}
int prepare_output_tensor(hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &output) {
for (size_t i = 0; i < output.size(); i++) {
hbDNNGetOutputTensorProperties(&output[i].properties, dnn_handle, i);
hbUCPMallocCached(&output[i].sysMem, output[i].properties.alignedByteSize, 0);
}
return 0;
}
int read_binary_file(std::string file_path, char **bin, int *length) {
std::ifstream ifs(file_path, std::ios::in | std::ios::binary);
ifs.seekg(0, std::ios::end);
*length = ifs.tellg();
ifs.seekg(0, std::ios::beg);
*bin = new char[sizeof(char) * (*length)];
ifs.read(*bin, *length);
ifs.close();
return 0;
}
/** You can define read_image_2_other_type to prepare your data **/
int read_image_2_nv12(std::string &y_path, std::string &uv_path,
std::vector<hbUCPSysMem> &image_mem, int &input_h,
int &input_w) {
// copy y data
auto w_stride = ALIGN_32(input_w);
int32_t y_mem_size = input_h * w_stride;
hbUCPMallocCached(&image_mem[0], y_mem_size, 0);
int32_t y_data_length = 0;
char *y_data = nullptr;
read_binary_file(y_path, &y_data, &y_data_length);
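  // Note: copying the whole y plane with one memcpy assumes input_w is
  // already 32-byte aligned (224 is), i.e. w_stride == input_w; otherwise
  // the y data should be copied row by row, honoring w_stride.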
memcpy(reinterpret_cast<char *>(image_mem[0].virAddr), y_data, y_mem_size);
// copy uv data
int32_t uv_height = input_h / 2;
int32_t uv_width = input_w / 2;
int32_t uv_mem_size = uv_height * w_stride;
hbUCPMallocCached(&image_mem[1], uv_mem_size, 0);
int32_t uv_data_length = 0;
char *uv_data = nullptr;
read_binary_file(uv_path, &uv_data, &uv_data_length);
memcpy(reinterpret_cast<char *>(image_mem[1].virAddr), uv_data, uv_mem_size);
  // make sure cached mem data is flushed to DDR before inference
hbUCPMemFlush(&image_mem[0], HB_SYS_MEM_CACHE_CLEAN);
hbUCPMemFlush(&image_mem[1], HB_SYS_MEM_CACHE_CLEAN);
  delete[] y_data;
  delete[] uv_data;
return 0;
}
int prepare_roi_mem(const std::vector<Roi> &rois,
std::vector<hbUCPSysMem> &roi_mem) {
auto roi_size = rois.size();
roi_mem.resize(roi_size);
  for (size_t i = 0; i < roi_size; ++i) {
int32_t mem_size = 4 * sizeof(int32_t);
hbUCPMallocCached(&roi_mem[i], mem_size, 0);
int32_t *roi_data = reinterpret_cast<int32_t *>(roi_mem[i].virAddr);
// The order of filling in the corner points of roi tensor is left, top, right, bottom
roi_data[0] = rois[i].left;
roi_data[1] = rois[i].top;
roi_data[2] = rois[i].right;
roi_data[3] = rois[i].bottom;
    // make sure cached mem data is flushed to DDR before inference
hbUCPMemFlush(&roi_mem[i], HB_SYS_MEM_CACHE_CLEAN);
}
return 0;
}
Once the executable has been built, the model inputs need to be prepared. If the input is of Featuremap type, the calibration data can be used directly as model input; otherwise, the data needs to be preprocessed before being fed to the model.
For Featuremap inputs, you can convert the calibration data from npy format to a bin file as follows:
import numpy as np
data = np.load("calibration_data.npy")
data.tofile("input.bin")
For non-Featuremap inputs, the data must be preprocessed first. Below we give examples of the preprocessing implemented in Python and in C++.
A Python implementation of the preprocessing may look as follows:
import PIL
import numpy as np
from PIL import Image
image_path = "./ILSVRC2012_val_00000001.JPEG"
def resize_transformer(image_data: np.ndarray, short_size: int):
    image = Image.fromarray(image_data.astype('uint8'), 'RGB')
    # PIL image size is (width, height)
    w, h = image.size
if (w <= h and w == short_size) or (h <= w and h == short_size):
return np.array(image)
# I.e., the width of the image is the short side
if w < h:
resize_size = (short_size, int(short_size * h / w))
# I.e., the height of the image is the short side
else:
resize_size = (int(short_size * w / h), short_size)
# Resize the image
data = np.array(image.resize(resize_size, Image.BILINEAR))
return data
def center_crop_transformer(image_data: np.ndarray, crop_size: int):
image = Image.fromarray(image_data.astype('uint8'), 'RGB')
image_width, image_height = image.size
crop_height, crop_width = (crop_size, crop_size)
crop_top = int(round((image_height - crop_height) / 2.))
crop_left = int(round((image_width - crop_width) / 2.))
image_data = image.crop((crop_left,
crop_top,
crop_left + crop_width,
crop_top + crop_height))
return np.array(image_data).astype(np.float32)
def rgb_to_nv12(image_data: np.ndarray):
r = image_data[:, :, 0]
g = image_data[:, :, 1]
b = image_data[:, :, 2]
y = (0.299 * r + 0.587 * g + 0.114 * b)
u = (-0.169 * r - 0.331 * g + 0.5 * b + 128)[::2, ::2]
v = (0.5 * r - 0.419 * g - 0.081 * b + 128)[::2, ::2]
uv = np.zeros(shape=(u.shape[0], u.shape[1] * 2))
for i in range(0, u.shape[0]):
for j in range(0, u.shape[1]):
uv[i, 2 * j] = u[i, j]
uv[i, 2 * j + 1] = v[i, j]
y = y.astype(np.uint8)
uv = uv.astype(np.uint8)
return y, uv
if __name__ == '__main__':
# load the image with PIL method
pil_image_data = PIL.Image.open(image_path).convert('RGB')
image_data = np.array(pil_image_data).astype(np.uint8)
# Resize the image
image_data = resize_transformer(image_data, 256)
# Crop the image
image_data = center_crop_transformer(image_data, 224)
    # Convert format from RGB to nv12
y, uv = rgb_to_nv12(image_data)
y.tofile("ILSVRC2012_val_00000001_y.bin")
uv.tofile("ILSVRC2012_val_00000001_uv.bin")
The example above reads the image with PIL; if you use OpenCV instead, refer to the following:
import cv2
import numpy as np
def image2nv12(image):
image = image.astype(np.uint8)
height, width = image.shape[0], image.shape[1]
yuv420p = cv2.cvtColor(image, cv2.COLOR_BGR2YUV_I420).reshape((height * width * 3 // 2, ))
y = yuv420p[:height * width]
uv_planar = yuv420p[height * width:].reshape((2, height * width // 4))
uv = uv_planar.transpose((1, 0)).reshape((height * width // 2, ))
nv12 = np.zeros_like(yuv420p)
# y component
nv12[:height * width] = y
# uv component, UVUV alternate store
nv12[height * width:] = uv
# Return separately
return y, uv
image = cv2.imread("./image.jpg")
y, uv = image2nv12(image)
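# Save the two planes to bin files for the on-board sample (the file names
# below are placeholders, mirroring the PIL example above)
y.tofile("image_y.bin")
uv.tofile("image_uv.bin")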
You can also implement the data preprocessing in C++ inside the on-board sample, following the same logic as the Python code above:
#include <fstream>
#include <iostream>
#include <vector>
#include <cstring>
#include <opencv2/opencv.hpp>
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
int32_t read_image_2_tensor_as_nv12(std::string &image_file,
hbDNNTensor *input_tensor) {
  // the layout of the input shape is NHWC
int input_h = input_tensor[0].properties.validShape.dimensionSize[1];
int input_w = input_tensor[0].properties.validShape.dimensionSize[2];
cv::Mat bgr_mat = cv::imread(image_file, cv::IMREAD_COLOR);
if (bgr_mat.empty()) {
std::cout << "image file not exist!" << std::endl;
return -1;
}
// resize
cv::Mat mat;
mat.create(input_h, input_w, bgr_mat.type());
cv::resize(bgr_mat, mat, mat.size(), 0, 0);
// convert to YUV420
  if (input_h % 2 || input_w % 2) {
    std::cout << "input image height and width must be even!" << std::endl;
return -1;
}
cv::Mat yuv_mat;
cv::cvtColor(mat, yuv_mat, cv::COLOR_BGR2YUV_I420);
uint8_t *yuv_data = yuv_mat.ptr<uint8_t>();
uint8_t *y_data_src = yuv_data;
// copy y data
uint8_t *y_data_dst =
reinterpret_cast<uint8_t *>(input_tensor[0].sysMem.virAddr);
for (int32_t h = 0; h < input_h; ++h) {
memcpy(y_data_dst, y_data_src, input_w);
y_data_src += input_w;
// add padding
y_data_dst += input_tensor[0].properties.stride[1];
}
// copy uv data
int32_t uv_height = input_tensor[1].properties.validShape.dimensionSize[1];
int32_t uv_width = input_tensor[1].properties.validShape.dimensionSize[2];
uint8_t *uv_data_dst =
reinterpret_cast<uint8_t *>(input_tensor[1].sysMem.virAddr);
uint8_t *u_data_src = yuv_data + input_h * input_w;
uint8_t *v_data_src = u_data_src + uv_height * uv_width;
for (int32_t h = 0; h < uv_height; ++h) {
auto *cur_data = uv_data_dst;
for (int32_t w = 0; w < uv_width; ++w) {
*cur_data++ = *u_data_src++;
*cur_data++ = *v_data_src++;
}
// add padding
uv_data_dst += input_tensor[1].properties.stride[1];
}
// make sure memory data is flushed to DDR before inference
hbUCPMemFlush(&input_tensor[0].sysMem, HB_SYS_MEM_CACHE_CLEAN);
hbUCPMemFlush(&input_tensor[1].sysMem, HB_SYS_MEM_CACHE_CLEAN);
return 0;
}
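As a usage sketch (our own illustration, not part of the sample): assuming the input tensors were allocated with prepare_tensor as in the quick-start sample, this function can replace the read_binary_file and memcpy steps there:
// Illustrative usage; the file name is only an example.
std::string image_file = "ILSVRC2012_val_00000001.JPEG";
if (read_image_2_tensor_as_nv12(image_file, input_tensors.data()) != 0) {
  std::cout << "read image to nv12 tensor failed" << std::endl;
  return -1;
}
// read_image_2_tensor_as_nv12 already flushes the cached memory, so the
// tensors can be passed to hbDNNInferV2 directly.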
After the model input data has been prepared and the binary input files for on-board inference have been generated correctly, make sure you also have the following ready:
An S100 development board, on which the program will actually run.
A model usable for on-board inference (*.hbm).
The on-board program (the main.cc file, cross-compiled into an executable for the board; an illustrative compilation command is shown after this list).
The dependency libraries of the on-board program. To reduce deployment effort, you can directly use the contents of the samples/ucp_tutorial/deps_aarch64/ucp/lib directory in the OE package.
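As an illustration only, a cross-compilation command might look like the following; the toolchain prefix, the include/library paths, and the library names are placeholders that need to be adapted to your OE package (check the headers and .so files it actually ships):
# Illustrative sketch; adjust paths, toolchain and library names to your setup.
aarch64-linux-gnu-g++ main.cc -std=c++11 \
    -I${OE_PACKAGE}/samples/ucp_tutorial/deps_aarch64/ucp/include \
    -L${OE_PACKAGE}/samples/ucp_tutorial/deps_aarch64/ucp/lib \
    -ldnn -lhbucp -o run_sample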
Once everything is ready, put the program, model file, input data, and dependency libraries together; a reference directory layout is as follows:
horizon
├── input.bin
├── lib
├── model.hbm
└── run_sample
Copy this directory as a whole to the board, for example with the following command:
scp -r horizon/ root@{board_ip}:/map/
After building the on-board sample and completing the preparation above, configure LD_LIBRARY_PATH on the board and then run the program to perform inference:
horizon@hobot:/map/horizon# export LD_LIBRARY_PATH=./lib:$LD_LIBRARY_PATH
horizon@hobot:/map/horizon# ./run_sample
...
This concludes the introduction to the principles and workflow of the whole model deployment process. In the following chapters, we take the public ResNet18 model as an example and provide end-to-end PTQ workflow guidance for several typical scenarios.