After the model conversion and compilation steps described in the previous sections, we provide the unified heterogeneous computing platform UCP on the development board to help you deploy the model quickly, together with related samples.
Deployment on the board can be roughly divided into three steps: building the on-board sample, preparing for on-board execution, and finally running on the board.
To build the on-board sample, the following preparation is required:
Prepare the dependency libraries required to run the on-board sample. The OE package already provides them under samples/ucp_tutorial/deps_aarch64; we recommend using the libraries in this directory to complete this step quickly.
Prepare the source file of the on-board sample (main.cc).
In main.cc, the main function and the input tensors need to be adjusted according to the model's inputs.
For models with Pyramid input, we provide a sample in the OE package under samples/ucp_tutorial/dnn/basic_samples/code/00_quick_start. Here we walk through the main function in main.cc and the preparation of the input and output tensors:
/**
* Step1: get model handle
* Step2: prepare input and output tensor
* Step3: set input data to input tensor
* Step4: run inference
* Step5: do postprocess with output data
* Step6: release resources
*/
#include <fstream>
#include <iostream>
#include <vector>
#include <cstring>
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
#define ALIGN(value, alignment) (((value) + ((alignment)-1)) & ~((alignment)-1))
#define ALIGN_32(value) ALIGN(value, 32)
const char* hbm_path = "model.hbm";
std::string data_y_path = "ILSVRC2012_val_00000001_y.bin";
std::string data_uv_path = "ILSVRC2012_val_00000001_uv.bin";
// Read binary input file
int read_binary_file(std::string file_path, char **bin, int *length) {
std::ifstream ifs(file_path, std::ios::in | std::ios::binary);
ifs.seekg(0, std::ios::end);
*length = ifs.tellg();
ifs.seekg(0, std::ios::beg);
*bin = new char[sizeof(char) * (*length)];
ifs.read(*bin, *length);
ifs.close();
return 0;
}
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
hbDNNHandle_t dnn_handle);
int main() {
// Get model handle
hbDNNPackedHandle_t packed_dnn_handle;
hbDNNHandle_t dnn_handle;
hbDNNInitializeFromFiles(&packed_dnn_handle, &hbm_path, 1);
const char **model_name_list;
int model_count = 0;
hbDNNGetModelNameList(&model_name_list, &model_count, packed_dnn_handle);
hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle, model_name_list[0]);
// Prepare input and output tensor
std::vector<hbDNNTensor> input_tensors;
std::vector<hbDNNTensor> output_tensors;
int input_count = 0;
int output_count = 0;
hbDNNGetInputCount(&input_count, dnn_handle);
hbDNNGetOutputCount(&output_count, dnn_handle);
input_tensors.resize(input_count);
output_tensors.resize(output_count);
// Initialize and malloc the tensor
prepare_tensor(input_tensors.data(), output_tensors.data(), dnn_handle);
// Copy binary input data to input tensor
int32_t data_length = 0;
char *y_data = nullptr;
read_binary_file(data_y_path, &y_data, &data_length);
// Copy y_data to input tensor
memcpy(reinterpret_cast<char *>(input_tensors[0].sysMem.virAddr),
y_data, input_tensors[0].sysMem.memSize);
  delete[] y_data;
// Refresh the cached system memory
hbUCPMemFlush(&(input_tensors[0].sysMem), HB_SYS_MEM_CACHE_CLEAN);
char *uv_data = nullptr;
read_binary_file(data_uv_path, &uv_data, &data_length);
// Copy uv_data to input tensor
memcpy(reinterpret_cast<char *>(input_tensors[1].sysMem.virAddr),
uv_data, input_tensors[1].sysMem.memSize);
  delete[] uv_data;
// Refresh the cached system memory
hbUCPMemFlush(&(input_tensors[1].sysMem), HB_SYS_MEM_CACHE_CLEAN);
// Submit task and wait till it completed
hbUCPTaskHandle_t task_handle{nullptr};
hbDNNTensor *output = output_tensors.data();
// Generate task handle
hbDNNInferV2(&task_handle, output, input_tensors.data(), dnn_handle);
// Submit task
hbUCPSchedParam ctrl_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&ctrl_param);
ctrl_param.backend = HB_UCP_BPU_CORE_ANY;
hbUCPSubmitTask(task_handle, &ctrl_param);
// Wait task completed
hbUCPWaitTaskDone(task_handle, 0);
// Parse inference result and calculate TOP1
hbUCPMemFlush(&output_tensors[0].sysMem, HB_SYS_MEM_CACHE_INVALIDATE);
auto result = reinterpret_cast<float *>(output_tensors[0].sysMem.virAddr);
float max_score = 0.0;
int label = -1;
// Find the max score and corresponding label
for (auto i = 0; i < 1000; i++) {
float score = result[i];
if (score > max_score) {
label = i;
max_score = score;
}
}
// Output the result
std::cout << "label: " << label << std::endl;
hbUCPReleaseTask(task_handle);
// Free input memory
for (int i = 0; i < input_count; i++) {
hbUCPFree(&(input_tensors[i].sysMem));
}
// Free output memory
for (int i = 0; i < output_count; i++) {
hbUCPFree(&(output_tensors[i].sysMem));
}
// Release model
hbDNNRelease(packed_dnn_handle);
}
// Prepare input tensor and output tensor
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
hbDNNHandle_t dnn_handle) {
// Get input and output tensor counts
int input_count = 0;
int output_count = 0;
hbDNNGetInputCount(&input_count, dnn_handle);
hbDNNGetOutputCount(&output_count, dnn_handle);
hbDNNTensor *input = input_tensor;
// Get the properties of the input tensor
for (int i = 0; i < input_count; i++) {
hbDNNGetInputTensorProperties(&input[i].properties, dnn_handle, i);
// Calculate the stride of the input tensor
auto dim_len = input[i].properties.validShape.numDimensions;
for (int32_t dim_i = dim_len - 1; dim_i >= 0; --dim_i) {
if (input[i].properties.stride[dim_i] == -1) {
auto cur_stride =
input[i].properties.stride[dim_i + 1] *
input[i].properties.validShape.dimensionSize[dim_i + 1];
input[i].properties.stride[dim_i] = ALIGN_32(cur_stride);
}
}
// Calculate the memory size of the input tensor and allocate cache memory
int input_memSize = input[i].properties.stride[0] *
input[i].properties.validShape.dimensionSize[0];
hbUCPMallocCached(&input[i].sysMem, input_memSize, 0);
}
hbDNNTensor *output = output_tensor;
// Get the properties of the output tensor
for (int i = 0; i < output_count; i++) {
hbDNNGetOutputTensorProperties(&output[i].properties, dnn_handle, i);
// Calculate the memory size of the output tensor and allocate cache memory
int output_memSize = output[i].properties.alignedByteSize;
hbUCPMallocCached(&output[i].sysMem, output_memSize, 0);
// Show how to get output name
const char *output_name;
hbDNNGetOutputName(&output_name, dnn_handle, i);
}
return 0;
}
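For brevity, the sample above ignores the return values of the hbDNN/hbUCP calls. In a real deployment it is worth checking them; below is a minimal sketch, assuming the convention that these interfaces return 0 on success (the CHECK_UCP macro itself is our own illustration, not part of the UCP API):
// Minimal error-checking sketch (illustrative, assumes 0 means success).
#define CHECK_UCP(expr)                                             \
  do {                                                              \
    auto _ret = (expr);                                             \
    if (_ret != 0) {                                                \
      std::cout << #expr << " failed, ret = " << _ret << std::endl; \
      return _ret;                                                  \
    }                                                               \
  } while (0)
// Example usage inside main():
// CHECK_UCP(hbDNNInitializeFromFiles(&packed_dnn_handle, &hbm_path, 1));
// CHECK_UCP(hbUCPSubmitTask(task_handle, &ctrl_param));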
For models with Resizer input, we provide a sample in the OE package under samples/ucp_tutorial/dnn/basic_samples/code/02_advanced_samples/roi_infer. Here we walk through the main function in main.cc and the preparation of the input and output tensors:
/**
* Step1: get model handle
* Step2: set input data to nv12
* Step3: prepare roi mem
* Step4: prepare input and output tensor
* Step5: run inference
* Step6: do postprocess with output data for every task
* Step7: release resources
*/
#include <fstream>
#include <cstring>
#include <iostream>
#include <map>
#include <vector>
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
const char *model_file = "model.hbm";
std::string data_y_path = "ILSVRC2012_val_00000001_y.bin";
std::string data_uv_path = "ILSVRC2012_val_00000001_uv.bin";
typedef struct Roi {
int32_t left;
int32_t top;
int32_t right;
int32_t bottom;
} Roi;
int read_image_2_nv12(std::string &y_path, std::string &uv_path,
std::vector<hbUCPSysMem> &image_mem, int &input_h,
int &input_w);
int prepare_roi_mem(const std::vector<Roi> &rois,
std::vector<hbUCPSysMem> &roi_mem);
int prepare_image_tensor(const std::vector<hbUCPSysMem> &image_mem, int input_h,
int input_w, hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &input_tensor);
int read_binary_file(std::string file_path, char **bin, int *length);
/**
* prepare roi tensor
* @param[in] roi_mem: roi mem info
* @param[in] dnn_handle: dnn handle
* @param[in] roi_tensor_id: tensor id of roi input in model
* @param[out] roi_tensor: roi tensor
*/
int prepare_roi_tensor(const hbUCPSysMem *roi_mem, hbDNNHandle_t dnn_handle,
int32_t roi_tensor_id, hbDNNTensor *roi_tensor);
/**
* prepare out tensor
* @param[in] dnn_handle: dnn handle
* @param[out] output: output tensor
*/
int prepare_output_tensor(hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &output);
int main(int argc, char **argv) {
// load model
hbDNNPackedHandle_t packed_dnn_handle;
hbDNNHandle_t dnn_handle;
const char **model_name_list;
int model_count = 0;
// Step1: get model handle
hbDNNInitializeFromFiles(&packed_dnn_handle, &model_file, 1);
hbDNNGetModelNameList(&model_name_list, &model_count, packed_dnn_handle);
hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle, model_name_list[0]);
// Step2: set input data to nv12
  // In the sample, since every task uses the same image, one memory block
  // can be allocated and reused. image_mems holds the y and uv image data.
std::vector<hbUCPSysMem> image_mems(2);
// image input size
int input_h = 224;
int input_w = 224;
read_image_2_nv12(data_y_path, data_uv_path, image_mems, input_h, input_w);
// Step3: prepare roi mem
  /**
   * Suppose we want to run inference on 2 roi tasks, so the number of ROIs
   * to prepare is also 2.
   */
  // left = 0, top = 0, right = 223, bottom = 223
Roi roi_1 = {0, 0, 223, 223};
// left = 1, top = 1, right = 223, bottom = 223
Roi roi_2 = {1, 1, 223, 223};
std::vector<Roi> rois;
rois.push_back(roi_1);
rois.push_back(roi_2);
int roi_num = 2;
std::vector<hbUCPSysMem> roi_mems(2);
prepare_roi_mem(rois, roi_mems);
// Step4: prepare input and output tensor
std::vector<std::vector<hbDNNTensor>> input_tensors(roi_num);
std::vector<std::vector<hbDNNTensor>> output_tensors(roi_num);
for (int i = 0; i < roi_num; ++i) {
// prepare input tensor
int input_count = 0;
hbDNNGetInputCount(&input_count, dnn_handle);
input_tensors[i].resize(input_count);
// prepare image tensor
    /** Tips:
     * In this sample all tasks use the same image, so memory is allocated
     * once and every input tensor reuses it. If your model takes different
     * input images, please allocate separate memory for each input.
     * */
prepare_image_tensor(image_mems, input_h, input_w,
dnn_handle, input_tensors[i]);
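    // Inputs 0 and 1 are the y/uv image tensors prepared above; in this
    // sample the roi tensor is the model's third input.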
auto roi_tensor_id = 2;
prepare_roi_tensor(&roi_mems[i], dnn_handle, roi_tensor_id,
&input_tensors[i][roi_tensor_id]);
// prepare output tensor
int output_count = 0;
hbDNNGetOutputCount(&output_count, dnn_handle);
output_tensors[i].resize(output_count);
prepare_output_tensor(dnn_handle, output_tensors[i]);
}
// Step5: run inference
hbUCPTaskHandle_t task_handle{nullptr};
  /** Tips:
   * In this sample, multiple inference tasks are attached to one task handle
   * and submitted together:
   * when task_handle is nullptr, hbDNNInferV2 creates a new task;
   * when task_handle already exists but has not been submitted yet, the new
   * task is attached to it, forming a multi-model task.
   * */
for (int i = 0; i < roi_num; ++i) {
hbDNNInferV2(&task_handle, output_tensors[i].data(),
input_tensors[i].data(), dnn_handle);
}
// submit multi tasks
hbUCPSchedParam infer_ctrl_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&infer_ctrl_param);
hbUCPSubmitTask(task_handle, &infer_ctrl_param);
// wait task done
hbUCPWaitTaskDone(task_handle, 0);
// Step6: do postprocess with output data for every task
// Find the max score and corresponding label
for (auto roi_idx = 0; roi_idx < roi_num; roi_idx++) {
auto result = reinterpret_cast<float *>(output_tensors[roi_idx][0].sysMem.virAddr);
float max_score = 0.0;
int label = -1;
for (auto i = 0; i < 1000; i++) {
float score = result[i];
if (score > max_score) {
label = i;
max_score = score;
}
}
std::cout << "label: " << label << std::endl;
}
// Step7: release resources
// release task handle
hbUCPReleaseTask(task_handle);
// free input mem
for (auto &mem : image_mems) {
hbUCPFree(&mem);
}
for (auto &mem : roi_mems) {
hbUCPFree(&mem);
}
// free output mem
for (auto &tensors : output_tensors) {
for (auto &tensor : tensors) {
hbUCPFree(&(tensor.sysMem));
}
}
// release model
hbDNNRelease(packed_dnn_handle);
return 0;
}
#define ALIGN(value, alignment) (((value) + ((alignment)-1)) & ~((alignment)-1))
#define ALIGN_32(value) ALIGN(value, 32)
int prepare_image_tensor(const std::vector<hbUCPSysMem> &image_mem, int input_h,
int input_w, hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &input_tensor) {
// y and uv tensor
for (int i = 0; i < 2; i++) {
hbDNNGetInputTensorProperties(&input_tensor[i].properties, dnn_handle, i);
input_tensor[i].sysMem = image_mem[i];
    /** Tips:
     * For a roi model, the input valid shape should be modified to the real
     * input image shape. Here the layout of the y/uv shape is NHWC.
     * */
input_tensor[i].properties.validShape.dimensionSize[1] = input_h;
input_tensor[i].properties.validShape.dimensionSize[2] = input_w;
if (i == 1) {
// uv input
input_tensor[i].properties.validShape.dimensionSize[1] /= 2;
input_tensor[i].properties.validShape.dimensionSize[2] /= 2;
}
    /** Tips:
     * For the input tensor, the stride should be set according to the real
     * padding of your data; 32-byte alignment is required for y/uv.
     **/
input_tensor[i].properties.stride[1] =
ALIGN_32(input_tensor[i].properties.stride[2] *
input_tensor[i].properties.validShape.dimensionSize[2]);
input_tensor[i].properties.stride[0] =
input_tensor[i].properties.stride[1] *
input_tensor[i].properties.validShape.dimensionSize[1];
}
return 0;
}
int prepare_roi_tensor(const hbUCPSysMem *roi_mem, hbDNNHandle_t dnn_handle,
int32_t roi_tensor_id, hbDNNTensor *roi_tensor) {
hbDNNGetInputTensorProperties(&roi_tensor->properties, dnn_handle, roi_tensor_id);
roi_tensor->sysMem = *roi_mem;
return 0;
}
int prepare_output_tensor(hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &output) {
for (size_t i = 0; i < output.size(); i++) {
hbDNNGetOutputTensorProperties(&output[i].properties, dnn_handle, i);
hbUCPMallocCached(&output[i].sysMem, output[i].properties.alignedByteSize, 0);
}
return 0;
}
int read_binary_file(std::string file_path, char **bin, int *length) {
std::ifstream ifs(file_path, std::ios::in | std::ios::binary);
ifs.seekg(0, std::ios::end);
*length = ifs.tellg();
ifs.seekg(0, std::ios::beg);
*bin = new char[sizeof(char) * (*length)];
ifs.read(*bin, *length);
ifs.close();
return 0;
}
/** You can define read_image_2_other_type to prepare your data **/
int read_image_2_nv12(std::string &y_path, std::string &uv_path,
std::vector<hbUCPSysMem> &image_mem, int &input_h,
int &input_w) {
// copy y data
auto w_stride = ALIGN_32(input_w);
int32_t y_mem_size = input_h * w_stride;
hbUCPMallocCached(&image_mem[0], y_mem_size, 0);
int32_t y_data_length = 0;
char *y_data = nullptr;
read_binary_file(y_path, &y_data, &y_data_length);
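  // Note: copying the whole y plane with one memcpy assumes input_w is
  // already 32-byte aligned (224 is), i.e. w_stride == input_w; otherwise
  // the y data should be copied row by row, honoring w_stride.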
memcpy(reinterpret_cast<char *>(image_mem[0].virAddr), y_data, y_mem_size);
// copy uv data
int32_t uv_height = input_h / 2;
int32_t uv_width = input_w / 2;
int32_t uv_mem_size = uv_height * w_stride;
hbUCPMallocCached(&image_mem[1], uv_mem_size, 0);
int32_t uv_data_length = 0;
char *uv_data = nullptr;
read_binary_file(uv_path, &uv_data, &uv_data_length);
memcpy(reinterpret_cast<char *>(image_mem[1].virAddr), uv_data, uv_mem_size);
  // make sure cached mem data is flushed to DDR before inference
hbUCPMemFlush(&image_mem[0], HB_SYS_MEM_CACHE_CLEAN);
hbUCPMemFlush(&image_mem[1], HB_SYS_MEM_CACHE_CLEAN);
  delete[] y_data;
  delete[] uv_data;
return 0;
}
int prepare_roi_mem(const std::vector<Roi> &rois,
std::vector<hbUCPSysMem> &roi_mem) {
auto roi_size = rois.size();
roi_mem.resize(roi_size);
  for (size_t i = 0; i < roi_size; ++i) {
int32_t mem_size = 4 * sizeof(int32_t);
hbUCPMallocCached(&roi_mem[i], mem_size, 0);
int32_t *roi_data = reinterpret_cast<int32_t *>(roi_mem[i].virAddr);
// The order of filling in the corner points of roi tensor is left, top, right, bottom
roi_data[0] = rois[i].left;
roi_data[1] = rois[i].top;
roi_data[2] = rois[i].right;
roi_data[3] = rois[i].bottom;
    // make sure cached mem data is flushed to DDR before inference
hbUCPMemFlush(&roi_mem[i], HB_SYS_MEM_CACHE_CLEAN);
}
return 0;
}
Once the executable has been built, the model inputs need to be prepared. If the input is of Featuremap type, the calibration data can be used directly as model input; otherwise, the data needs to be preprocessed before being fed to the model.
For Featuremap inputs, you can convert the calibration data from npy format to a bin file as follows:
import numpy as np
data = np.load("calibration_data.npy")
data.tofile("input.bin")
For non-Featuremap inputs, the data must be preprocessed first. Below we give examples of the preprocessing implemented in Python and in C++.
A Python implementation of the preprocessing may look as follows:
import PIL
import numpy as np
from PIL import Image
image_path = "./ILSVRC2012_val_00000001.JPEG"
def resize_transformer(image_data: np.ndarray, short_size: int):
    image = Image.fromarray(image_data.astype('uint8'), 'RGB')
    # PIL image size is (width, height)
    w, h = image.size
if (w <= h and w == short_size) or (h <= w and h == short_size):
return np.array(image)
# I.e., the width of the image is the short side
if w < h:
resize_size = (short_size, int(short_size * h / w))
# I.e., the height of the image is the short side
else:
resize_size = (int(short_size * w / h), short_size)
# Resize the image
data = np.array(image.resize(resize_size, Image.BILINEAR))
return data
def center_crop_transformer(image_data: np.ndarray, crop_size: int):
image = Image.fromarray(image_data.astype('uint8'), 'RGB')
image_width, image_height = image.size
crop_height, crop_width = (crop_size, crop_size)
crop_top = int(round((image_height - crop_height) / 2.))
crop_left = int(round((image_width - crop_width) / 2.))
image_data = image.crop((crop_left,
crop_top,
crop_left + crop_width,
crop_top + crop_height))
return np.array(image_data).astype(np.float32)
def rgb_to_nv12(image_data: np.ndarray):
r = image_data[:, :, 0]
g = image_data[:, :, 1]
b = image_data[:, :, 2]
y = (0.299 * r + 0.587 * g + 0.114 * b)
u = (-0.169 * r - 0.331 * g + 0.5 * b + 128)[::2, ::2]
v = (0.5 * r - 0.419 * g - 0.081 * b + 128)[::2, ::2]
uv = np.zeros(shape=(u.shape[0], u.shape[1] * 2))
for i in range(0, u.shape[0]):
for j in range(0, u.shape[1]):
uv[i, 2 * j] = u[i, j]
uv[i, 2 * j + 1] = v[i, j]
y = y.astype(np.uint8)
uv = uv.astype(np.uint8)
return y, uv
if __name__ == '__main__':
# load the image with PIL method
pil_image_data = PIL.Image.open(image_path).convert('RGB')
image_data = np.array(pil_image_data).astype(np.uint8)
# Resize the image
image_data = resize_transformer(image_data, 256)
# Crop the image
image_data = center_crop_transformer(image_data, 224)
    # Convert format from RGB to nv12
y, uv = rgb_to_nv12(image_data)
y.tofile("ILSVRC2012_val_00000001_y.bin")
uv.tofile("ILSVRC2012_val_00000001_uv.bin")
The example above reads the image with PIL; if you use OpenCV instead, refer to the following:
import cv2
import numpy as np
def image2nv12(image):
image = image.astype(np.uint8)
height, width = image.shape[0], image.shape[1]
yuv420p = cv2.cvtColor(image, cv2.COLOR_BGR2YUV_I420).reshape((height * width * 3 // 2, ))
y = yuv420p[:height * width]
uv_planar = yuv420p[height * width:].reshape((2, height * width // 4))
uv = uv_planar.transpose((1, 0)).reshape((height * width // 2, ))
nv12 = np.zeros_like(yuv420p)
# y component
nv12[:height * width] = y
# uv component, UVUV alternate store
nv12[height * width:] = uv
# Return separately
return y, uv
image = cv2.imread("./image.jpg")
y, uv = image2nv12(image)
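# Save the two planes to bin files for the on-board sample (the file names
# below are placeholders, mirroring the PIL example above)
y.tofile("image_y.bin")
uv.tofile("image_uv.bin")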
You can also implement the data preprocessing in C++ inside the on-board sample, following the same logic as the Python code above:
#include <fstream>
#include <iostream>
#include <vector>
#include <cstring>
#include <opencv2/opencv.hpp>
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
int32_t read_image_2_tensor_as_nv12(std::string &image_file,
hbDNNTensor *input_tensor) {
  // the layout of the input shape is NHWC
int input_h = input_tensor[0].properties.validShape.dimensionSize[1];
int input_w = input_tensor[0].properties.validShape.dimensionSize[2];
cv::Mat bgr_mat = cv::imread(image_file, cv::IMREAD_COLOR);
if (bgr_mat.empty()) {
std::cout << "image file not exist!" << std::endl;
return -1;
}
// resize
cv::Mat mat;
mat.create(input_h, input_w, bgr_mat.type());
cv::resize(bgr_mat, mat, mat.size(), 0, 0);
// convert to YUV420
  if (input_h % 2 || input_w % 2) {
    std::cout << "input image height and width must be even!" << std::endl;
return -1;
}
cv::Mat yuv_mat;
cv::cvtColor(mat, yuv_mat, cv::COLOR_BGR2YUV_I420);
uint8_t *yuv_data = yuv_mat.ptr<uint8_t>();
uint8_t *y_data_src = yuv_data;
// copy y data
uint8_t *y_data_dst =
reinterpret_cast<uint8_t *>(input_tensor[0].sysMem.virAddr);
for (int32_t h = 0; h < input_h; ++h) {
memcpy(y_data_dst, y_data_src, input_w);
y_data_src += input_w;
// add padding
y_data_dst += input_tensor[0].properties.stride[1];
}
// copy uv data
int32_t uv_height = input_tensor[1].properties.validShape.dimensionSize[1];
int32_t uv_width = input_tensor[1].properties.validShape.dimensionSize[2];
uint8_t *uv_data_dst =
reinterpret_cast<uint8_t *>(input_tensor[1].sysMem.virAddr);
uint8_t *u_data_src = yuv_data + input_h * input_w;
uint8_t *v_data_src = u_data_src + uv_height * uv_width;
for (int32_t h = 0; h < uv_height; ++h) {
auto *cur_data = uv_data_dst;
for (int32_t w = 0; w < uv_width; ++w) {
*cur_data++ = *u_data_src++;
*cur_data++ = *v_data_src++;
}
// add padding
uv_data_dst += input_tensor[1].properties.stride[1];
}
// make sure memory data is flushed to DDR before inference
hbUCPMemFlush(&input_tensor[0].sysMem, HB_SYS_MEM_CACHE_CLEAN);
hbUCPMemFlush(&input_tensor[1].sysMem, HB_SYS_MEM_CACHE_CLEAN);
return 0;
}
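As a usage sketch (our own illustration, not part of the sample): assuming the input tensors were allocated with prepare_tensor as in the quick-start sample, this function can replace the read_binary_file and memcpy steps there:
// Illustrative usage; the file name is only an example.
std::string image_file = "ILSVRC2012_val_00000001.JPEG";
if (read_image_2_tensor_as_nv12(image_file, input_tensors.data()) != 0) {
  std::cout << "read image to nv12 tensor failed" << std::endl;
  return -1;
}
// read_image_2_tensor_as_nv12 already flushes the cached memory, so the
// tensors can be passed to hbDNNInferV2 directly.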
After the model input data has been prepared and the binary input files for on-board inference have been generated correctly, make sure you also have the following ready:
An S100 development board, on which the program will actually run.
A model usable for on-board inference (*.hbm).
The on-board program (the main.cc file, cross-compiled into an executable for the board; an illustrative compilation command is shown after this list).
The dependency libraries of the on-board program. To reduce deployment effort, you can directly use the contents of the samples/ucp_tutorial/deps_aarch64/ucp/lib directory in the OE package.
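As an illustration only, a cross-compilation command might look like the following; the toolchain prefix, the include/library paths, and the library names are placeholders that need to be adapted to your OE package (check the headers and .so files it actually ships):
# Illustrative sketch; adjust paths, toolchain and library names to your setup.
aarch64-linux-gnu-g++ main.cc -std=c++11 \
    -I${OE_PACKAGE}/samples/ucp_tutorial/deps_aarch64/ucp/include \
    -L${OE_PACKAGE}/samples/ucp_tutorial/deps_aarch64/ucp/lib \
    -ldnn -lhbucp -o run_sample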
Once everything is ready, put the program, model file, input data, and dependency libraries together; a reference directory layout is as follows:
horizon
├── input.bin
├── lib
├── model.hbm
└── run_sample
Copy this directory as a whole to the board, for example with the following command:
scp -r horizon/ root@{board_ip}:/map/
After building the on-board sample and completing the preparation above, configure LD_LIBRARY_PATH on the board and then run the program to perform inference:
horizon@hobot:/map/horizon# export LD_LIBRARY_PATH=./lib:$LD_LIBRARY_PATH
horizon@hobot:/map/horizon# ./run_sample
...
This concludes the introduction to the principles and workflow of the whole model deployment process. In the following chapters, we take the public ResNet18 model as an example and provide end-to-end PTQ workflow guidance for several typical scenarios.