"); //-->
对 hrt_model_exec 工具进行修改,让它能让单个模型按设计帧率运行指定的帧数,还能计算每帧推理耗时和系统时间戳;
运行修改后的 hrt_model_exec 工具和系统软件里的带宽检测工具 hrut_ddr,分析日志,查看模型耗时和带宽的变化情况。
通过示例说明工具的使用和数据分析方法。
通过 --perf_fps 实现模型在单核单线程下按照设计帧率运行;
通过 --frame_count 控制按照设计帧率推理的总帧数;
log 中输出系统时间戳、当前推理帧数,推理 Latency。
//在代码开头增加perf_fps参数, DEFINE_int32(perf_fps, 99999999, "run at the set FPS"); //在static int32_t parsing_gflags()函数中增加 flagsparase.FLAGS_perf_fps = FLAGS_perf_fps;
// Target frame rate for fixed-FPS inference, added as a new member of the
// flags struct and filled from the --perf_fps gflag in parsing_gflags().
// model_performance() only takes the fixed-rate path when this is > 0.
// NOTE(review): the gflags default shown in this article is 99999999, while
// this member defaults to 0 — confirm which default the final code uses.
int32_t FLAGS_perf_fps{0};
// Prints the tool's usage/help table. Only the lines relevant to the new
// --perf_fps option are shown here; "....." marks elided original code.
void print_usage() {
  std::stringstream ss;
  .....
  ss << "[Examples]" << std::endl;
  ss << "---------------------------------------------------------------"
        "------------------------------------------------"
     << std::endl;
  .....
  ss << std::setw(2) << "|" << std::setw(3) << " " << std::setw(32)
     << "--dequantize_process";
  // Insert the new option right here — the setw() calls build a fixed-width
  // table, so adding the line anywhere else garbles the printed columns.
  ss << std::setw(2) << "|" << std::setw(3) << " " << std::setw(32)
     << "--perf_fps";
  ss << std::endl;
  ss << std::setw(3) << " " << std::setw(32) << " ";
  .....
  std::cout << ss.str();
}
// Runs model inference for performance measurement.
//
// Resolves the model name/handle from the packed handle, then:
//  * if --thread_num == 1 and --perf_fps > 0, runs a single-threaded,
//    fixed-frame-rate loop that logs a wall-clock timestamp, the frame id
//    and the per-frame latency for every inference;
//  * otherwise falls back to the original multi-threaded path.
//
// Returns 0 on success, -1 when several models are packed but no
// --model_name was given.
// NOTE(review): HB_CHECK_SUCCESS presumably logs and returns on API failure
// — confirm; if it returns early inside the perf_fps branch, the tensor
// arrays allocated below would leak on that error path.
int32_t model_performance(hbDNNPackedHandle_t packed_dnn_handle,
                          FLAGSParase flagsparase) {
  // Fetch the model name list; auto-select when exactly one model is packed.
  const char **model_name_list;
  int32_t model_count = 0;
  HB_CHECK_SUCCESS(
      hbDNNGetModelNameList(&model_name_list, &model_count, packed_dnn_handle),
      "hbDNNGetModelNameList failed");
  if (flagsparase.FLAGS_model_name.empty()) {
    if (model_count == 1) {
      flagsparase.FLAGS_model_name = model_name_list[0];
    } else {
      LOGE("Multiple models found. Please set --model_name.");
      return -1;
    }
  }
  // Get the model handle.
  hbDNNHandle_t dnn_handle;
  HB_CHECK_SUCCESS(hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle,
                                       flagsparase.FLAGS_model_name.c_str()),
                   "hbDNNGetModelHandle failed");
  // Single-thread + perf_fps scenario?
  if (flagsparase.FLAGS_thread_num == 1 && flagsparase.FLAGS_perf_fps > 0) {
    // Single-threaded, fixed-frame-rate inference.
    int32_t input_count = 0;
    int32_t output_count = 0;
    int32_t frame_count = flagsparase.FLAGS_frame_count;
    float target_fps = flagsparase.FLAGS_perf_fps;
    // Ideal per-frame budget in microseconds (e.g. 20 FPS -> 50000 us).
    float frame_interval_us = 1e6 / target_fps;
    HB_CHECK_SUCCESS(hbDNNGetInputCount(&input_count, dnn_handle),
                     "hbDNNGetInputCount failed");
    HB_CHECK_SUCCESS(hbDNNGetOutputCount(&output_count, dnn_handle),
                     "hbDNNGetOutputCount failed");
    hbDNNTensor *input_tensor = new hbDNNTensor[input_count];
    hbDNNTensor *output_tensor = new hbDNNTensor[output_count];
    HB_CHECK_SUCCESS(
        prepare_input_tensor(input_tensor, dnn_handle, input_count,
                             flagsparase),
        "prepare_input_tensor failed");
    HB_CHECK_SUCCESS(
        prepare_output_tensor(output_tensor, dnn_handle, output_count),
        "prepare_output_tensor failed");
    for (int frame_id = 0; frame_id < frame_count; ++frame_id) {
      // CurrentTs() is assumed to return microseconds — TODO confirm; the
      // /1000.0f conversion to ms below relies on it.
      uint64_t start_us = CurrentTs();
      hbUCPTaskHandle_t task_handle = nullptr;
      HB_CHECK_SUCCESS(
          hbDNNInferV2(&task_handle, output_tensor, input_tensor, dnn_handle),
          "hbDNNInferV2 failed");
      hbUCPSchedParam sched_param{};
      HB_UCP_INITIALIZE_SCHED_PARAM(&sched_param);
      sched_param.backend = flagsparase.FLAGS_core_mask;
      sched_param.priority = flagsparase.FLAGS_task_priority;
      HB_CHECK_SUCCESS(hbUCPSubmitTask(task_handle, &sched_param),
                       "hbUCPSubmitTask failed");
      HB_CHECK_SUCCESS(hbUCPWaitTaskDone(task_handle,
                                         flagsparase.FLAGS_task_timeout),
                       "hbUCPWaitTaskDone failed");
      uint64_t end_us = CurrentTs();
      float latency_ms = (end_us - start_us) / 1000.0f;
      // std::cout << "[Frame " << frame_id << "] latency = "
      //           << std::fixed << std::setprecision(3)
      //           << latency_ms << " ms" << std::endl;
      // Build a "YYYY-MM-DD HH:MM:SS.mmm" wall-clock timestamp string.
      auto now = std::chrono::system_clock::now();
      auto in_time_t = std::chrono::system_clock::to_time_t(now);
      auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                    now.time_since_epoch()) % 1000;
      std::stringstream timestamp_ss;
      timestamp_ss << "[" << std::put_time(std::localtime(&in_time_t), "%F %T")
                   << "." << std::setfill('0') << std::setw(3) << ms.count()
                   << "] ";
      // Log system time, frame_id and inference latency.
      std::cout << timestamp_ss.str() << "[Frame " << frame_id
                << "] latency = " << std::fixed << std::setprecision(3)
                << latency_ms << " ms" << std::endl;
      HB_CHECK_SUCCESS(hbUCPReleaseTask(task_handle),
                       "hbUCPReleaseTask failed");
      // Sleep away the remainder of the frame budget so the average rate
      // matches --perf_fps. Pacing is per-frame, so small scheduling drift
      // can accumulate over a long run.
      uint64_t used_us = end_us - start_us;
      if (used_us < frame_interval_us) {
        usleep(static_cast<useconds_t>(frame_interval_us - used_us));
      }
    }
    // Release tensor memory.
    for (int i = 0; i < input_count; ++i) {
      release_tensor(&input_tensor[i]);
    }
    for (int i = 0; i < output_count; ++i) {
      release_tensor(&output_tensor[i]);
    }
    delete[] input_tensor;
    delete[] output_tensor;
  } else {
    // Multi-threaded inference keeps the original logic.
    run_model_with_multi_thread(dnn_handle, flagsparase.FLAGS_thread_num,
                                flagsparase.FLAGS_frame_count, flagsparase);
  }
  return 0;
}
# Build the modified hrt_model_exec for aarch64 from the sample tools tree.
cd samples/ucp_tutorial/tools/hrt_model_exec/
bash build_aarch64.sh
├── aarch64 │ ├── bin │ │ └── hrt_model_exec #执行程序 │ └── lib #依赖库 │ ├── libdnn.so │ ├── libhb_arm_rpc.so │ ├── libhbrt4.so │ ├── libhbtl_ext_dnn.so │ ├── libhbtl.so │ ├── libhbucp.so │ ├── libhlog_wrapper.so │ └── libperfetto_sdk.so └── script └── run_hrt_model_exec.sh #执行脚本
#设计帧率20FPS的模型在core 0上单线程运行2000帧 hrt_model_exec perf --model_file model.hbm --perf_fps 20 --frame_count 2000 --core_id 0 2>&1 |tee ./infer.log
[2025-04-03 09:36:05.486] [Frame 0] latency = 22.915 ms [2025-04-03 09:36:05.586] [Frame 1] latency = 22.495 ms [2025-04-03 09:36:05.686] [Frame 2] latency = 22.488 ms [2025-04-03 09:36:05.787] [Frame 3] latency = 22.515 ms [2025-04-03 09:36:05.887] [Frame 4] latency = 22.496 ms [2025-04-03 09:36:05.987] [Frame 5] latency = 22.658 ms [2025-04-03 09:36:06.087] [Frame 6] latency = 22.607 ms .....
Usage: hrut_ddr [OPTION]... Show and calculate memory throughput through AIX bus in each period. Mandatory arguments to long options are mandatory for short options too. -t, --type The type of monitoring range. Supported values for type are(case-insensitive) when multiple type specified, Enclose in quotation marks e.g. -t "mcu cpu" If the types exceeds 1, a RoundRobin method is used. For accuracy, set as less types as possible e.g. In the first period the mcu data is read, second period the cpu data is read. The elapsed time get averaged, and each type result in one round put into one table all vdo cam cpe0 cpe1 cpe2 cpe3 cpelite gpu vdsp peri his sram bpu mcu cpu secland cpu only monitor the throughput of CPU master range bpu only monitor the throughput of BPU master range cam only monitor the throughput of Camera master range all monitor the throughput of all range types (default) rr_all RoundRobin between all range types -p, --period The sample period for monitored range. (unit: us, default: 1000, range:[1000, 2000000]) -d, --device The char device of DDR Perf Monitor. [0~5] 0: ddrsys0 mon0, 2 ddrsys1_mon0 all (default) -n, --number The sampling period times for monitored range before copying to userspace. (0~400] default: 100 !!!When in roundrobin mode, this is forcely set to 1 -N, --over_all Over_all read times. i.e. Approximately how much tables you get in commands line -f, --filename the csv output filename -r, --raw Output raw data, hexadecimal format, without conversion. Decimal by default -c, --csv Output csv format data -D, --dissh Disable shell output J6P SOC, of 8 DDR systems, 2 monitors each SYS, 1 ports each monitor Example: hrut_ddr -t all -p 1000 -d 0 hrut_ddr -t all -p 1000 -r hrut_ddr -t cpu -p 1000 hrut_ddr -t "cpu mcu" -p 1000 -f "mon0.csv" hrut_ddr -d "0 1" -p 1000
# Monitor only the BPU-master DDR bandwidth range; tee the periodic tables
# (with their localtime timestamps) to hrut_tool.log for the analysis script.
./hrut_ddr -t bpu 2>&1 |tee ./hrut_tool.log
# Run each model at its designed frame rate for a fixed frame count, logging
# every run to its own file so latencies can be correlated per model.
hrt_model_exec perf --model_file model1.hbm --perf_fps 20 --frame_count 20000 2>&1 |tee ./model1.log
hrt_model_exec perf --model_file model2.hbm --perf_fps 10 --frame_count 10000 2>&1 |tee ./model2.log
hrt_model_exec perf --model_file model3.hbm --perf_fps 20 --frame_count 20000 2>&1 |tee ./model3.log
hrt_model_exec perf --model_file model4.hbm --perf_fps 10 --frame_count 10000 2>&1 |tee ./model4.log
指定绘图的时间段,hrut_ddr 和 hrt_model_exec 均有系统时间戳;
计算每秒钟的平均耗时和带宽占用;
将多个模型的耗时和带宽集成在同一张图上。
import re
import csv
from datetime import datetime
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Merges a hrut_ddr bandwidth log with several hrt_model_exec latency logs,
# buckets both by wall-clock second, writes a per-second CSV and draws a
# dual-axis bandwidth/latency trend chart.

# === Configuration ===
bandwidth_log_file = 'hrut_tool.log'
latency_log_files = ['model1.log', 'model2.log', 'model3.log', 'model4.log']
csv_file = 'performance_per_second.csv'
plot_file = 'multi_latency_bandwidth_trend.png'

# Optional time window: only samples inside [time_start, time_end) are kept.
time_start = '2025-04-03 09:42:00'
time_end = '2025-04-03 09:45:00'
dt_start = datetime.strptime(time_start, '%Y-%m-%d %H:%M:%S')
dt_end = datetime.strptime(time_end, '%Y-%m-%d %H:%M:%S')

# Regexes for the two log formats:
# - hrut_ddr prints a "localtime: <datetime>.<usec>" header followed by
#   "Read| <val>| ..." / "Write| <val>| ..." table rows;
# - hrt_model_exec prints "[YYYY-MM-DD HH:MM:SS.mmm] [Frame N] latency = X ms".
time_re = re.compile(r'localtime:\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\.(\d{6})')
read_re = re.compile(r'Read\|\s*(\d+)\|\s*\d+')
write_re = re.compile(r'Write\|\s*(\d+)\|\s*\d+')
latency_re = re.compile(r'\[?(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})[.:](\d{3})\]?\s*\[Frame \d+\]\s*latency\s*=\s*([\d.]+)\s*ms')

# Per-second buckets; keys are "YYYY-MM-DD HH:MM:SS" strings.
bandwidth_per_sec = defaultdict(list)
latency_per_sec_list = [defaultdict(list) for _ in latency_log_files]

# === Parse the bandwidth log ===
with open(bandwidth_log_file, 'r') as f:
    lines = f.readlines()

i = 0
while i < len(lines):
    time_match = time_re.search(lines[i])
    if time_match:
        timestamp = datetime.strptime(time_match.group(1), '%Y-%m-%d %H:%M:%S')
        read = write = 0
        # The Read/Write rows for this header appear within the next few
        # lines; keep the last value seen in a 10-line window.
        for j in range(i, min(i + 10, len(lines))):
            r = read_re.search(lines[j])
            w = write_re.search(lines[j])
            if r:
                read = int(r.group(1))
            if w:
                write = int(w.group(1))
        if dt_start <= timestamp < dt_end:
            key = timestamp.strftime('%Y-%m-%d %H:%M:%S')
            bandwidth_per_sec[key].append(read + write)
        # Jump past the scanned window. (The original `i = j` re-examined
        # line j and looped forever when the last line of the file matched
        # time_re; `j + 1` also avoids double-counting a header at line j.)
        i = j + 1
    else:
        i += 1

# === Parse the latency logs ===
for idx, log_file in enumerate(latency_log_files):
    with open(log_file, 'r') as f:
        for line in f:
            match = latency_re.search(line)
            if match:
                timestamp = datetime.strptime(match.group(1), '%Y-%m-%d %H:%M:%S')
                latency_val = float(match.group(3))
                if dt_start <= timestamp < dt_end:
                    key = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                    latency_per_sec_list[idx][key].append(latency_val)

# === Merge the per-second keys and write the CSV ===
all_keys = set(bandwidth_per_sec.keys())
for d in latency_per_sec_list:
    all_keys |= set(d.keys())
all_keys = sorted(all_keys)

with open(csv_file, 'w', newline='') as f:
    writer = csv.writer(f)
    headers = ['time', 'avg_bandwidth'] + [f'avg_latency_{i+1}' for i in range(len(latency_log_files))]
    writer.writerow(headers)
    for key in all_keys:
        # Membership is checked before indexing so the defaultdicts are not
        # polluted with empty lists and len() can never be zero here.
        bw_avg = sum(bandwidth_per_sec[key]) / len(bandwidth_per_sec[key]) if key in bandwidth_per_sec else 0
        row = [key, round(bw_avg, 2)]
        for d in latency_per_sec_list:
            if key in d:
                lat_avg = sum(d[key]) / len(d[key])
            else:
                lat_avg = 0
            row.append(round(lat_avg, 3))
        writer.writerow(row)

print(f"✅ 写入 CSV 完成: {csv_file}")

# === Plot ===
times = [datetime.strptime(k, '%Y-%m-%d %H:%M:%S') for k in all_keys]
bandwidth_vals = [sum(bandwidth_per_sec[k]) / len(bandwidth_per_sec[k]) if k in bandwidth_per_sec else 0 for k in all_keys]
latency_series = []
for d in latency_per_sec_list:
    latency_vals = [sum(d[k]) / len(d[k]) if k in d else 0 for k in all_keys]
    latency_series.append(latency_vals)

fig, ax1 = plt.subplots(figsize=(12, 6))

# Left axis: bandwidth.
ax1.plot(times, bandwidth_vals, 'b-o', label='Bandwidth (MiB/s)')
ax1.set_ylabel('Bandwidth (MiB/s)', color='b')
ax1.tick_params(axis='y', labelcolor='b')
ax1.xaxis.set_major_locator(mdates.MinuteLocator(interval=1))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
plt.xticks(rotation=45)

# Right axis: one dashed latency curve per model log.
ax2 = ax1.twinx()
colors = ['r', 'orange', 'purple', 'green']
for i, latency_vals in enumerate(latency_series):
    ax2.plot(times, latency_vals, marker='x', linestyle='--', color=colors[i], label=f'Latency {i+1} (ms)')
ax2.set_ylabel('Latency (ms)', color='r')
ax2.tick_params(axis='y', labelcolor='r')

# Combine both axes' legends into one box.
fig.tight_layout()
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
plt.title('Bandwidth and Latency Comparison (per second)')
plt.grid(True)
plt.savefig(plot_file, dpi=150)
plt.show()
print(f"✅ 图像保存为: {plot_file}")
import re

# Scans a hrut_ddr log and prints the 20 largest Read+Write totals for both
# the monitored range (column 1) and the bandwidth (column 2), each tagged
# with the record's localtime timestamp.

log_file = "hrut.log"  # replace with the path to your log file
range_entries = []
bandwidth_entries = []

with open(log_file, "r") as f:
    lines = f.readlines()

i = 0
while i < len(lines):
    if lines[i].startswith("Elapsed time"):
        # The header line carries the sample's wall-clock timestamp.
        time_match = re.search(r"localtime:\s+([\d\-\s:.]+)", lines[i])
        if time_match:
            current_time = time_match.group(1).strip()
        else:
            current_time = "(unknown time)"
        range_read = range_write = None
        band_read = band_write = None
        # The Read/Write rows are expected within the next 10 lines.
        for j in range(i+1, i+10):
            if j >= len(lines):
                break
            line = lines[j].strip()
            if line.startswith("Read|"):
                # Row layout: "Read| <range value>| <bandwidth value>| ..."
                parts = line.split("|")
                if len(parts) >= 3:
                    try:
                        range_read = int(parts[1].strip())
                        band_read = int(parts[2].strip())
                    except ValueError:
                        pass  # non-numeric cell: leave this sample incomplete
            elif line.startswith("Write|"):
                parts = line.split("|")
                if len(parts) >= 3:
                    try:
                        range_write = int(parts[1].strip())
                        band_write = int(parts[2].strip())
                    except ValueError:
                        pass
        # Only record a sample when both its Read and Write values were found.
        if range_read is not None and range_write is not None:
            total = range_read + range_write
            range_entries.append((total, current_time))
        if band_read is not None and band_write is not None:
            total = band_read + band_write
            bandwidth_entries.append((total, current_time))
    i += 1

# Sort each list descending by total and print the top 20.
range_entries.sort(reverse=True, key=lambda x: x[0])
bandwidth_entries.sort(reverse=True, key=lambda x: x[0])

print("前20个 Range1:bpu_p0 的 Read+Write 总和记录:")
for idx, (val, t) in enumerate(range_entries[:20], 1):
    print(f"{idx:2d}. 总和: {val}, 时间: {t}")

print("\n前20个 Bandwidth 的 Read+Write 总和记录:")
for idx, (val, t) in enumerate(bandwidth_entries[:20], 1):
    print(f"{idx:2d}. 总和: {val}, 时间: {t}")
*博客内容为网友个人发布,仅代表博主个人观点,如有侵权请联系工作人员删除。