"); //-->
| Artifact | Precision | Description |
| --- | --- | --- |
| optimized_float.onnx | float | graph-optimized model, e.g. BN fused into Conv |
| calibrated.onnx | fake-quantized | calibration nodes inserted; per-node quantization parameters computed from calibration-data statistics |
| ptq.onnx | LUT ops fixed-point + other ops fake-quantized | lookup-table (LUT) operators converted to fixed point |
| quantized.bc | fixed-point | the whole model converted to fixed point, expressed in the Horizon hbir intermediate representation |
| hbm | instruction set | the final deployment model produced by compilation |
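The intermediate artifacts can also be inspected programmatically. As a minimal sketch using the hbdk4 APIs that appear later in this post (the file name is illustrative), the fixed-point hbir model can be loaded and its input names listed:

from hbdk4.compiler import load

quantized_bc = load("quantized.bc")
func = quantized_bc[0]
# list the hbir function's input node names
print([inp.name for inp in func.inputs])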
# Calibration parameter group
calibration_parameters:
  quant_config: './quant_config.json'
my_qconfig_setter = QconfigSetter(
    reference_qconfig=get_qconfig(observer=MSEObserver),
    templates=[
        ModuleNameTemplate({"": qint16}),
        ModuleNameTemplate({"quant": torch.float16}),
        ...
    ],
    save_dir="./work_dir",
)

import onnx
from hbdk4.compiler.onnx import export
from hbdk4.compiler import convert, save, compile
# Fake-quantized ONNX model from PTQ calibration; the nonlinear LUT operators are already fixed-point
ptq_model = onnx.load("xxx_ptq_model.onnx")
# Export the hbir model with LUT operators fixed-point and the remaining operators fake-quantized
qat_bc = export(ptq_model)
save(qat_bc, "qat.bc")
# Convert to the fully fixed-point hbir model
quantized_bc = convert(qat_bc, "nash-b")
save(quantized_bc, "quantized.bc")
# Compile into the hbm deployment model
compile(
m=quantized_bc,
path="model.hbm",
opt=2,
march="nash-b",
progress_bar=True,
input_no_padding=True,
output_no_padding=True
)

from horizon_plugin_pytorch.quantization.hbdk4 import export

def export(
    model: nn.Module,
    example_inputs: Any,
    name: str = "forward",
    input_names: Optional[Any] = None,  # recommended: configure input/output node names at export time
    output_names: Optional[Any] = None,
    input_descs: Optional[Any] = None,
    output_descs: Optional[Any] = None,
    native_pytree: bool = True,
) -> Module
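For reference, a minimal sketch of calling export() against the signature above; the QAT model, input shape, and node names here are hypothetical placeholders:

import torch

example_inputs = (torch.randn(1, 3, 224, 224),)  # hypothetical input shape
qat_bc = export(
    qat_model,  # hypothetical QAT model (an nn.Module)
    example_inputs,
    name="forward",
    input_names=["img"],  # name the I/O nodes at export time, as recommended above
    output_names=["score"],
)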
from hbdk4.compiler import load, convert
qat_bc = load("qat.bc")
func = qat_bc[0]
batch_input = ["input_name1"] # 需要使用独立地址方式部署的输入节点名称列表
resizer_input = ["resize"] # 部署时数据来源于resizer的输入节点名称列表
pyramid_input = ["pym"] # 部署时数据来源于pyramid的输入节点名称列表
def change_source(input, source):
    node = input.insert_transpose(permutes=[0, 3, 1, 2])
    node = node.insert_image_preprocess(
        mode="yuvbt601full2bgr", divisor=1, mean=[128, 128, 128], std=[128, 128, 128]
    )
    if source == "pyramid":
        node.insert_image_convert("nv12")
    elif source == "resizer":
        node.insert_roi_resize("nv12")
for input in func.inputs[::-1]:
    if input.name in batch_input:
        origin_name = input.name
        split_inputs = input.insert_split(dim=0)
        for split_input in reversed(split_inputs):
            if origin_name in pyramid_input:
                change_source(split_input, "pyramid")
            elif origin_name in resizer_input:
                change_source(split_input, "resizer")

from hbdk4.compiler import hbm_perf

hbm_perf(model="xxx.hbm", output_dir="./")
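If the input sources are modified as above, the edited hbir presumably still goes through the same convert/compile flow shown earlier before hbm_perf can analyze the resulting hbm; a sketch reusing those APIs (file names are arbitrary):

from hbdk4.compiler import compile, convert, save

save(qat_bc, "qat_modified.bc")
quantized_bc = convert(qat_bc, "nash-b")
compile(m=quantized_bc, path="model_modified.hbm", opt=2, march="nash-b")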
from hbdk4.compiler import load
quantized_bc = load("quantized.bc")
quantized_bc[0].remove_io_op(op_types=["Quantize", "Dequantize"])

// 1. Load the model and get the model name list and model handle
{
hbDNNInitializeFromFiles(&packed_dnn_handle, &modelFileName, 1);
hbDNNGetModelNameList(&model_name_list, &model_count, packed_dnn_handle);
hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle, model_name_list[0]);
}
// 2. Prepare tensors according to the model's input/output info
std::vector<hbDNNTensor> input_tensors;
std::vector<hbDNNTensor> output_tensors;
int input_count = 0;
int output_count = 0;
{
hbDNNGetInputCount(&input_count, dnn_handle);
hbDNNGetOutputCount(&output_count, dnn_handle);
input_tensors.resize(input_count);
output_tensors.resize(output_count);
prepare_tensor(input_tensors.data(), output_tensors.data(), dnn_handle);
}
// 3. Prepare input data and fill it into the corresponding tensors
{
read_data_2_tensor(input_data, input_tensors);
// Flush after updating the inputs so that the BPU uses the correct data
for (int i = 0; i < input_count; i++) {
hbUCPMemFlush(&input_tensors[i].sysMem, HB_SYS_MEM_CACHE_CLEAN);
}
}
// 4. Create the task and run inference
{
// Create the task
hbDNNInferV2(&task_handle, output_tensors.data(), input_tensors.data(), dnn_handle);
// Submit the task
hbUCPSchedParam sched_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&sched_param);
sched_param.backend = HB_UCP_BPU_CORE_ANY;
hbUCPSubmitTask(task_handle, &sched_param);
// Wait for the task to finish
hbUCPWaitTaskDone(task_handle, 0);
}
// 5. Process the output data
{
// Flush before reading the outputs to avoid reading stale data from the cache
for (int i = 0; i < output_count; i++) {
hbUCPMemFlush(&output_tensors[i].sysMem, HB_SYS_MEM_CACHE_INVALIDATE);
}
// Post-process the outputs
}
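// For reference, a sketch of reading the first output, assuming it is
// float32 (real code should check the data type in the tensor properties):
// auto *scores = reinterpret_cast<float *>(output_tensors[0].sysMem.virAddr);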
// 6. Release resources
{
// Release the task
hbUCPReleaseTask(task_handle);
// Free the input memory
for (int i = 0; i < input_count; i++) {
hbUCPFree(&(input_tensors[i].sysMem));
}
// Free the output memory
for (int i = 0; i < output_count; i++) {
hbUCPFree(&(output_tensors[i].sysMem));
}
// Release the model
hbDNNRelease(packed_dnn_handle);
}

# Set environment variables
# arch is the architecture type: aarch64 or x86
arch=aarch64
bin=../$arch/bin/hrt_model_exec
lib=../$arch/lib/
export LD_LIBRARY_PATH=${lib}:${LD_LIBRARY_PATH}
# Get model info
${bin} model_info --model_file=xxx.hbm
# Single-frame inference
${bin} infer --model_file=xxx.hbm --input_file=xxx.bin
# Performance evaluation - latency (single thread)
${bin} perf --model_file=xxx.hbm --thread_num 1 --frame_count=1000
# Performance evaluation - FPS (multi-threaded)
${bin} perf --model_file=xxx.hbm --thread_num 8 --frame_count=1000

The HW dimensions of a Resizer input are dynamic because the original input can have an arbitrary size;
the stride of a Pyramid/Resizer input is dynamic, which can be understood as support for the Crop feature. For details, see the user manual 《统一计算平台-模型推理开发-基础示例包使用说明-advanced_samples-crop》.
hbDNNTensor *input = input_tensor;
for (int i = 0; i < input_count; i++) {
HB_CHECK_SUCCESS(
hbDNNGetInputTensorProperties(&input[i].properties, dnn_handle, i),
"hbDNNGetInputTensorProperties failed");
auto dim_len = input[i].properties.validShape.numDimensions; // get the number of dimensions
for (int32_t dim_i = dim_len - 1; dim_i >= 0; --dim_i) {
if (input[i].properties.stride[dim_i] == -1) { // stride == -1 means this dimension is dynamic
auto cur_stride = // compute the current dimension's stride
input[i].properties.stride[dim_i + 1] *
input[i].properties.validShape.dimensionSize[dim_i + 1];
input[i].properties.stride[dim_i] = ALIGN_32(cur_stride); // align to 32
}
}
int input_memSize = input[i].properties.stride[0] * // compute the memory size
input[i].properties.validShape.dimensionSize[0];
HB_CHECK_SUCCESS(hbUCPMallocCached(&input[i].sysMem[0], input_memSize, 0),
"hbUCPMallocCached failed");
const char *input_name;
HB_CHECK_SUCCESS(hbDNNGetInputName(&input_name, dnn_handle, i), // get the input node name
"hbDNNGetInputName failed");
}

ucp_tutorial/dnn/basic_samples/code/00_quick_start/resnet_nv12/src/main.cc
#define ALIGN(value, alignment) (((value) + ((alignment)-1)) & ~((alignment)-1))
#define ALIGN_32(value) ALIGN(value, 32)
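// A quick sanity check of the macros above (illustrative values):
static_assert(ALIGN_32(100) == 128, "100 rounds up to the next multiple of 32");
static_assert(ALIGN_32(64) == 64, "already-aligned values are unchanged");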
int prepare_image_tensor(const std::vector<hbUCPSysMem> &image_mem, int input_h,
int input_w, hbDNNHandle_t dnn_handle,
std::vector<hbDNNTensor> &input_tensor) {
// Prepare the Y and UV input tensors
for (int i = 0; i < 2; i++) {
HB_CHECK_SUCCESS(hbDNNGetInputTensorProperties(&input_tensor[i].properties,
dnn_handle, i),
"hbDNNGetInputTensorProperties failed");
// auto w_stride = ALIGN_32(input_w);
// int32_t y_mem_size = input_h * w_stride;
input_tensor[i].sysMem[0] = image_mem[i];
// Set the original image size, NHWC layout
input_tensor[i].properties.validShape.dimensionSize[1] = input_h;
input_tensor[i].properties.validShape.dimensionSize[2] = input_w;
if (i == 1) {
// The UV plane is 1/2 the size of the Y plane
input_tensor[i].properties.validShape.dimensionSize[1] /= 2;
input_tensor[i].properties.validShape.dimensionSize[2] /= 2;
}
// Strides must be 32-aligned
input_tensor[i].properties.stride[1] =
ALIGN_32(input_tensor[i].properties.stride[2] *
input_tensor[i].properties.validShape.dimensionSize[2]);
input_tensor[i].properties.stride[0] =
input_tensor[i].properties.stride[1] *
input_tensor[i].properties.validShape.dimensionSize[1];
}
return 0;
}
// Prepare the ROI input tensor
int prepare_roi_tensor(const hbUCPSysMem *roi_mem, hbDNNHandle_t dnn_handle,
int32_t roi_tensor_id, hbDNNTensor *roi_tensor) {
HB_CHECK_SUCCESS(hbDNNGetInputTensorProperties(&roi_tensor->properties,
dnn_handle, roi_tensor_id),
"hbDNNGetInputTensorProperties failed");
roi_tensor->sysMem[0] = *roi_mem;
return 0;
}
int prepare_roi_mem(const std::vector<hbDNNRoi> &rois,
std::vector<hbUCPSysMem> &roi_mem) {
auto roi_size = rois.size();
roi_mem.resize(roi_size);
for (size_t i = 0; i < roi_size; ++i) {
int32_t mem_size = 4 * sizeof(int32_t);
HB_CHECK_SUCCESS(hbUCPMallocCached(&roi_mem[i], mem_size, 0),
"hbUCPMallocCached failed");
int32_t *roi_data = reinterpret_cast<int32_t *>(roi_mem[i].virAddr);
roi_data[0] = rois[i].left;
roi_data[1] = rois[i].top;
roi_data[2] = rois[i].right;
roi_data[3] = rois[i].bottom;
hbUCPMemFlush(&roi_mem[i], HB_SYS_MEM_CACHE_CLEAN);
}
return 0;
}

ucp_tutorial/dnn/basic_samples/code/02_advanced_samples/roi_infer/src/roi_infer.cc
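// A sketch of wiring the ROI helpers above together (the ROI values and
// tensor id are illustrative):
// std::vector<hbDNNRoi> rois = {{0, 0, 127, 127}};
// std::vector<hbUCPSysMem> roi_mem;
// prepare_roi_mem(rois, roi_mem);
// prepare_roi_tensor(&roi_mem[0], dnn_handle, roi_tensor_id, &roi_tensor);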
// Get the model handles and store them
std::vector<hbDNNHandle_t> model_handles;
// Prepare each model's inputs and outputs (preparation omitted)
std::vector<std::vector<hbDNNTensor>> inputs;
std::vector<std::vector<hbDNNTensor>> outputs;
// Create the tasks and run inference
{
// Create and append tasks, reusing the same task_handle
hbUCPTaskHandle_t task_handle{nullptr};
for(size_t task_id{0U}; task_id < inputs.size(); task_id++){
hbDNNInferV2(&task_handle, outputs[task_id].data(), inputs[task_id].data(), model_handles[task_id]);
}
// Submit the tasks
hbUCPSchedParam sche_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&sche_param);
sche_param.backend = HB_UCP_BPU_CORE_ANY;
hbUCPSubmitTask(task_handle, &sche_param);
// Wait for all tasks to finish
hbUCPWaitTaskDone(task_handle, 0);
}

Task scheduling order: priority > customId > submit_time (task submission time). For example, a later-submitted task with a higher priority is scheduled before an earlier one with a lower priority; only when priorities are equal do customId and then submission time break the tie.
priority supports [0, 255]; for model tasks, higher-priority tasks can preempt lower-priority ones.
For tasks on other backends, priority also supports [0, 255], but preemption is not supported, so they can all be regarded as normal priority.
import numpy as np
from horizon_tc_ui.hb_runtime import HBRuntime
sess = HBRuntime("quantized.bc")
input_names = sess.input_names
output_names = sess.output_names
data1 = np.load("input1.npy")
data2 = np.load("input2.npy")
input_feed = {input_names[0]: data1, input_names[1]: data2}
output = sess.run(output_names, input_feed)

import numpy as np
from hbdk4.compiler import load
hbir = load("quantized.bc")
func = hbir[0]
data1 = np.load("input1.npy")
data2 = np.load("input2.npy")
input_feed = {func.inputs[0].name: data1, func.inputs[1].name: data2}
hbir_outputs = func.feed(input_feed)

import torch

# The hbm can also be passed in as a list; at inference time, specify model_name to choose
# which model to run, so the .so used for inference only needs to be transferred once
hbm_model = HbmRpcSession(
host="xx.xx.xx.xx",
local_hbm_path="xx.hbm",
)
# Print the model's input/output info
hbm_model.show_input_output_info()
# Prepare input data
input_data = {
'img': torch.ones((1, 3, 224, 224), dtype=torch.int8)
}
# Run inference and return the results
# If a list was passed in, model_name must be specified correctly
# output_data = hbm_model(input_data, model_name=model_name)
output_data = hbm_model(input_data)
print([output_data[k].shape for k in output_data])
# Shut down the server
hbm_model.close_server()

int test_custom_op(void *input, void *output, void *tm) {
// custom impl
return 0;
}

// Register the custom operator in the DSP image
hb_dsp_register_fn(cmd, test_custom_op, latency);
// Map the input/output hbUCPSysMem to DSP-accessible memory addresses
hbUCPSysMem in;
hbUCPMalloc(&in, in_size, 0);
hbDSPAddrMap(&in, &in);
hbUCPSysMem out;
hbUCPMalloc(&out, out_size, 0);
hbDSPAddrMap(&out, &out);
// Create and submit the DSP task
hbUCPTaskHandle_t task_handle{nullptr};
hbDSPRpcV2(&task_handle, &in, &out, cmd);
hbUCPSchedParam ctrl_param;
HB_UCP_INITIALIZE_SCHED_PARAM(&ctrl_param);
ctrl_param.backend = HB_UCP_DSP_CORE_ANY;
hbUCPSubmitTask(task_handle, &ctrl_param);
// Wait for the task to finish
hbUCPWaitTaskDone(task_handle, 0);
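// Presumably the task and the mapped buffers are released the same way as in
// the BPU examples above (a sketch):
hbUCPReleaseTask(task_handle);
hbUCPFree(&in);
hbUCPFree(&out);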