近年来,基于深度学习的人体动作识别研究越来越多。SlowFast 模型提出了快慢两条通路的网络结构,在动作识别数据集上表现十分优异。本文介绍 SlowFast 的数据准备、训练方法以及使用 ONNX 进行推理,着重介绍如何使用 TensorRT 对 SlowFast 进行推理,并结合 yolov5 + deepsort 进行人物追踪,最后使用 C++ 完成部署。


1.1 剪裁视频

准备多组视频数据,其中IN_DATA_DIR 为原始视频数据存放目录,OUT_DATA_DIR为目标视频数据存放目录。这一步保证所有视频长度相同




if [[ ! -d "${OUT_DATA_DIR}" ]]; then

  echo "${OUT_DATA_DIR} doesn't exist. Creating it.";

  mkdir -p ${OUT_DATA_DIR}


for video in $(ls -A1 -U ${IN_DATA_DIR}/*)


    for i in {0..10}


      index=$(expr $i \* 10)


      if [ ! -f "${out_name}" ]; then

        ffmpeg -ss ${index} -t 80 -i "${video}" "${out_name}"




1.2 提取关键帧







if [[ ! -d "${OUT_DATA_DIR}" ]]; then

  echo "${OUT_DATA_DIR} doesn't exist. Creating it.";

  mkdir -p ${OUT_DATA_DIR}



for video in $(ls -A1 -U ${IN_DATA_DIR}/*)




  if [[ $video_name = *".webm" ]]; then







  mkdir -p "${out_video_dir}"




  ffmpeg -i "${video}" -r 1 -q:v 1 "${out_name}"



1.3 分割视频




if [[ ! -d "${OUT_DATA_DIR}" ]]; then

  echo "${OUT_DATA_DIR} doesn't exist. Creating it.";

  mkdir -p ${OUT_DATA_DIR}


for video in $(ls -A1 -U ${IN_DATA_DIR}/*)



  if [ ! -f "${out_name}" ]; then

    ffmpeg -ss 0 -t 100 -i "${video}" "${out_name}"



1.4 文件目录

ava  #一级文件夹,用来存放视频信息

person_box_67091280_iou90 #二级文件夹,用来存放目标检测信息文件夹

——ava_detection_train_boxes_and_labels_include_negative_v2.2.csv #二级文件夹下文件,用来存放目标检测的信息,用于训练

——ava_detection_val_boxes_and_labels.csv #二级文件夹下文件,用来存放目标检测的信息,用于测试

ava_action_list_v2.2_for_activitynet_2019.pbtxt #一级文件夹下的文件,用来存放标签信息

ava_val_excluded_timestamps_v2.2.csv #一级文件夹下的文件,用来存放没有人物的帧,在训练过程中会抛弃这些帧

ava_train_v2.2.csv #一级文件夹下的文件,用来存放训练数据,关键帧的信息

ava_val_v2.2.csv  #一级文件夹下的文件,用来存放验证数据,关键帧的信息

frame_lists  #一级文件夹,存放1.3中生成的




frames  #一级文件夹,存放1.3中生成的










2.1 环境准备

pip install iopath

pip install fvcore

pip install simplejson

pip install pytorchvideo

2.2  detectron2 安装

!python -m pip install pyyaml==5.1

import sys, os, distutils.core

# Note: This is a faster way to install detectron2 in Colab, but it does not include all functionalities.

# See https://detectron2.readthedocs.io/tutorials/install.html for full installation instructions

!git clone 'https://github.com/facebookresearch/detectron2'

dist = distutils.core.run_setup("./detectron2/setup.py")

!python -m pip install {' '.join([f"'{x}'" for x in dist.install_requires])}

sys.path.insert(0, os.path.abspath('./detectron2'))

3.slowfast 训练

3.1 训练

python tools/run_net.py --cfg configs/AVA/SLOWFAST_32x2_R50_SHORT.yaml



  ENABLE: False

  DATASET: ava

  BATCH_SIZE: 8 #64




  CHECKPOINT_FILE_PATH: '/content/SLOWFAST_32x2_R101_50_50.pkl'  #预训练模型地址









  PATH_TO_DATA_DIR: '/content/ava'


  ENABLE: True



  FRAME_DIR: '/content/ava/frames'   #数据准备阶段生成的目录

  FRAME_LIST_DIR: '/content/ava/frame_lists'

  ANNOTATION_DIR: '/content/ava/annotations'












  ALPHA: 4








  DEPTH: 50

  TRANS_FUNC: bottleneck_transform

  STRIDE_1X1: False

  NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]

  SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]]

  SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]]


  LOCATION: [[[], []], [[], []], [[], []], [[], []]]

  GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]]

  INSTANTIATION: dot_product

  POOL: [[[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]]]





  BASE_LR: 0.1

  LR_POLICY: steps_with_relative_lrs

  STEPS: [0, 10, 15, 20]

  LRS: [1, 0.1, 0.01, 0.001]





  WARMUP_START_LR: 0.000125




  ARCH: slowfast

  MODEL_NAME: SlowFast

  LOSS_FUNC: bce


  HEAD_ACT: sigmoid


  ENABLE: False

  DATASET: ava









3.2 训练过程常见报错

1.将 slowfast/datasets/ava_helper.py 中的 AVA_VALID_FRAMES 改为你的视频长度


2.pytorchvideo.layers.distributed 报错

from pytorchvideo.layers.distributed import ( # noqa

ImportError: cannot import name 'cat_all_gather' from 'pytorchvideo.layers.distributed'


3.pytorchvideo.losses 报错

File "SlowFast/slowfast/models/losses.py", line 11, in

from pytorchvideo.losses.soft_target_cross_entropy import (

ModuleNotFoundError: No module named 'pytorchvideo.losses'


4.slowfast 预测


python tools/run_net.py --cfg demo/AVA/SLOWFAST_32x2_R101_50_50.yaml




while was_read:



    while was_read and len(frames) < seq_length:

        was_read, frame =cap.read()


Step2: 使用yolov5进行目标检测

1.yolov5 推理代码,将sys.path.insert路径和权重路径weights进行更改

import argparse

import os

import platform

import shutil

import time

from pathlib import Path

import sys

import json

sys.path.insert(1, '/content/drive/MyDrive/yolov5/')

import cv2

import torch

import torch.backends.cudnn as cudnn

import numpy as np

import argparse

import time

import cv2

import torch

import torch.backends.cudnn as cudnn

from numpy import random

from models.common import DetectMultiBackend

from utils.augmentations import letterbox

from utils.general import check_img_size, non_max_suppression, scale_coords, set_logging

from utils.torch_utils import select_device

# ####### 参数设置

conf_thres = 0.6

iou_thres = 0.5


imgsz = 640

weights = "/content/yolov5l.pt"

device = '0'

stride = 32

names = ["person"]

import os

def init():
    """Load the YOLOv5 detector and publish its settings to module globals.

    Builds a ``DetectMultiBackend`` from the module-level ``weights`` path on
    the device returned by ``select_device('0')``, rebinds the module globals
    ``device``, ``stride`` and ``imgsz`` from the loaded model, converts the
    model with ``model.half()`` and returns it.

    Returns:
        The loaded model, after ``model.half()`` has been applied.
    """

    # Initialize

    # These module-level names are rebound below; process_image() reads them.
    global imgsz, device, stride


    device = select_device('0')

    # NOTE(review): `half` is not read again in this function; model.half()
    # below runs unconditionally, whatever the device type is.
    half = device.type != 'cpu'  # half precision only supported on CUDA

    model = DetectMultiBackend(weights, device=device, dnn=False)

    # Only `stride` is used afterwards; pt/jit/engine are unpacked but unused here.
    stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine

    imgsz = check_img_size(imgsz, s=stride)  # check img_size against the model stride

    model.half()  # to FP16


    return model

def process_image(model, input_image=None, args=None, **kwargs):

    img0 = input_image

    img = letterbox(img0, new_shape=imgsz, stride=stride, auto=True)[0]

    img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB

    img = np.ascontiguousarray(img)

    img = torch.from_numpy(img).to(device)

    img = img.half()

    img /= 255.0  # 0 - 255 to 0.0 - 1.0

    if len(img.shape) == 3:

        img = img[None]

    pred = model(img, augment=False, val=True)[0]

    pred = non_max_suppression(pred, conf_thres, iou_thres, agnostic=False)


    for i, det in enumerate(pred):  # detections per image

        gn = torch.tensor(img0.shape)[[1, 0, 1, 0]]  # normalization gain whwh

        if det is not None and len(det):

            # Rescale boxes from img_size to im0 size

            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()

            for *xyxy, conf, cls in det:

                if cls==0:


    if len(result)==0:

      return None

    return torch.from_numpy(np.array(result))

2.bbox 预处理

def scale_boxes(size, boxes, height, width):
    """
    Scale bounding boxes by the factor that brings the image's short side to `size`.

    Args:
        size (int): target length of the short side of the image.
        boxes (ndarray or tensor): bounding boxes to scale, `num boxes` x 4.
            They are scaled in place with `*=`, so they must hold floats.
        height (int): the height of the original image.
        width (int): the width of the original image.

    Returns:
        The same `boxes` object that was passed in, after scaling.
    """
    import math  # local import: this listing's import list does not include math

    # The short side already equals `size`: nothing to scale.
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return boxes

    if width < height:
        # Portrait image: width becomes `size`, so derive the factor from the height.
        new_height = int(math.floor((float(height) / width) * size))
        boxes *= float(new_height) / height
    else:
        # Landscape or square image: height becomes `size`.
        new_width = int(math.floor((float(width) / height) * size))
        boxes *= float(new_width) / width
    return boxes

Step3: 图像预处理

1.Resize 图像尺寸

def scale(size, image):
    """
    Scale the short side of the image to `size`, keeping the aspect ratio.

    Args:
        size (int): target length of the short side.
        image (array): image to scale, `height` x `width` x `channel`.

    Returns:
        (ndarray): the scaled image as float32, `height` x `width` x `channel`.
        If the short side already equals `size`, the input object is returned
        unchanged (no resize and no dtype conversion).
    """
    import math  # local import: this listing's import list does not include math

    height = image.shape[0]
    width = image.shape[1]
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return image

    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
    else:
        new_width = int(math.floor((float(width) / height) * size))
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(
        image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
    )
    return img.astype(np.float32)


def tensor_normalize(tensor, mean, std, func=None):
    """
    Normalize a given tensor by subtracting the mean and dividing by the std.

    Args:
        tensor (tensor): tensor to normalize. A uint8 tensor is first
            converted to float and divided by 255.
        mean (tensor or list or tuple): mean value to subtract.
        std (tensor or list or tuple): std to divide by.
        func (callable, optional): applied to the tensor before normalization.

    Returns:
        tensor: the normalized tensor.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    # isinstance (rather than an exact type comparison) also accepts tuples
    # and list subclasses.
    if isinstance(mean, (list, tuple)):
        mean = torch.tensor(mean)
    if isinstance(std, (list, tuple)):
        std = torch.tensor(std)
    if func is not None:
        tensor = func(tensor)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor

3.构建slow以及fast 输入数据

主要思路为:从64帧图像数据中均匀选取32帧作为fast的输入,再从fast的32帧中均匀选取8帧作为slow的输入,并将维度由 T H W C 转换为 C T H W。因此最后fast_pathway的维度为(b,3,32,h,w),slow_pathway的维度为(b,3,8,h,w)。

def process_cv2_inputs(frames, num_frames=32, alpha=4):
    """
    Normalize one clip and build the slow and fast pathway inputs.

    Args:
        frames (list of array): input images of one clip, each
            `height` x `width` x `channel`, in range [0, 255].
        num_frames (int): number of frames sampled uniformly from the clip
            for the fast pathway.
        alpha (int): temporal stride between pathways; the slow pathway gets
            `num_frames // alpha` frames sampled uniformly from the fast one.

    Returns:
        list of tensor: `[slow_pathway, fast_pathway]`, each with layout
        1 x C x T x H x W.
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, [0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Uniformly sample `num_frames` frames along the temporal axis.
    index = torch.linspace(0, inputs.shape[1] - 1, num_frames).long()
    inputs = torch.index_select(inputs, 1, index)
    fast_pathway = inputs
    # Temporal sub-sampling of the fast pathway gives the slow pathway.
    slow_pathway = torch.index_select(
        inputs,
        1,
        torch.linspace(
            0, inputs.shape[1] - 1, inputs.shape[1] // alpha
        ).long(),
    )
    frame_list = [slow_pathway, fast_pathway]
    # Add the leading batch dimension to each pathway.
    inputs = [inp.unsqueeze(0) for inp in frame_list]
    return inputs

5.slowfast onnx 推理

5.1 导出onnx文件

import os

import sys

from collections import OrderedDict

import torch

import argparse

work_root = os.path.split(os.path.realpath(__file__))[0]

from slowfast.config.defaults import get_cfg

import slowfast.utils.checkpoint as cu

from slowfast.models import build_model

def parser_args():

    parser = argparse.ArgumentParser()






            work_root, "/content/drive/MyDrive/SlowFast/demo/AVA/SLOWFAST_32x2_R101_50_50.yaml"),

        help="Path to the config file",






        help='use half mode',







        help='test model file path',





        default=os.path.join(work_root, "/content/SLOWFAST_head.onnx"),

        help='save model file path',


    return parser.parse_args()

def main():

    args = parser_args()


    cfg_file = args.cfg_file

    checkpoint_file = args.checkpoint

    save_checkpoint_file = args.save

    half_flag = args.half

    cfg = get_cfg()


    cfg.TEST.CHECKPOINT_FILE_PATH = checkpoint_file


    print("export pytorch model to onnx!\n")

    device = "cuda:0"

    with torch.no_grad():

        model = build_model(cfg)

        model = model.to(device)


        cu.load_test_checkpoint(cfg, model)

        if half_flag:


        fast_pathway= torch.randn(1, 3, 32, 256, 455)

        slow_pathway= torch.randn(1, 3, 8, 256, 455)


        fast_pathway = fast_pathway.to(device)

        slow_pathway = slow_pathway.to(device)

        inputs = [slow_pathway, fast_pathway]

        for p in model.parameters():

         p.requires_grad = False

        torch.onnx.export(model, (inputs,bbox), save_checkpoint_file, input_names=['slow_pathway','fast_pathway','bbox'],output_names=['output'], opset_version=12)


def onnx_check():
    """Load the exported ONNX file and validate it with the ONNX checker.

    The file path is read from `args.save` as returned by `parser_args()`.

    Raises:
        Whatever `onnx.load` or `onnx.checker.check_model` raise for an
        unreadable or invalid model file.
    """
    import onnx

    args = parser_args()
    onnx_model_path = args.save
    model = onnx.load(onnx_model_path)
    # Loading alone does not validate the graph; run the checker explicitly.
    onnx.checker.check_model(model)


if __name__ == '__main__':


5.2 onnx 推理

import torch

import math

import onnxruntime

from torchvision.ops import roi_align

import argparse

import os

import platform

import shutil

import time

from pathlib import Path

import sys

import json

sys.path.insert(1, '/content/drive/MyDrive/yolov5/')

import cv2

import torch

import torch.backends.cudnn as cudnn

import numpy as np

import argparse

import time

import cv2

import torch

import torch.backends.cudnn as cudnn

from numpy import random

from models.common import DetectMultiBackend

from utils.augmentations import letterbox

from utils.general import check_img_size, non_max_suppression, scale_coords, set_logging

from utils.torch_utils import select_device

# ####### 参数设置

conf_thres = 0.6

iou_thres = 0.5


imgsz = 640

weights = "/content/yolov5l.pt"

device = '0'

stride = 32

names = ["person"]

import os

def init():
    """Load the YOLOv5 detector and publish its settings to module globals.

    Builds a ``DetectMultiBackend`` from the module-level ``weights`` path on
    the device returned by ``select_device('0')``, rebinds the module globals
    ``device``, ``stride`` and ``imgsz`` from the loaded model, converts the
    model with ``model.half()`` and returns it.

    Returns:
        The loaded model, after ``model.half()`` has been applied.
    """

    # Initialize

    # These module-level names are rebound below; process_image() reads them.
    global imgsz, device, stride


    device = select_device('0')

    # NOTE(review): `half` is not read again in this function; model.half()
    # below runs unconditionally, whatever the device type is.
    half = device.type != 'cpu'  # half precision only supported on CUDA

    model = DetectMultiBackend(weights, device=device, dnn=False)

    # Only `stride` is used afterwards; pt/jit/engine are unpacked but unused here.
    stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine

    imgsz = check_img_size(imgsz, s=stride)  # check img_size against the model stride

    model.half()  # to FP16


    return model

def process_image(model, input_image=None, args=None, **kwargs):

    img0 = input_image

    img = letterbox(img0, new_shape=imgsz, stride=stride, auto=True)[0]

    img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB

    img = np.ascontiguousarray(img)

    img = torch.from_numpy(img).to(device)

    img = img.half()

    img /= 255.0  # 0 - 255 to 0.0 - 1.0

    if len(img.shape) == 3:

        img = img[None]

    pred = model(img, augment=False, val=True)[0]

    pred = non_max_suppression(pred, conf_thres, iou_thres, agnostic=False)


    for i, det in enumerate(pred):  # detections per image

        gn = torch.tensor(img0.shape)[[1, 0, 1, 0]]  # normalization gain whwh

        if det is not None and len(det):

            # Rescale boxes from img_size to im0 size

            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()

            for *xyxy, conf, cls in det:

                if cls==0:


    if len(result)==0:

      return None

    for i in range(32-len(result)):


    return torch.from_numpy(np.array(result))

def scale(size, image):
    """
    Scale the short side of the image to `size`, keeping the aspect ratio.

    Args:
        size (int): target length of the short side.
        image (array): image to scale, `height` x `width` x `channel`.

    Returns:
        (ndarray): the scaled image as float32, `height` x `width` x `channel`.
        If the short side already equals `size`, the input object is returned
        unchanged (no resize and no dtype conversion).
    """
    height = image.shape[0]
    width = image.shape[1]
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return image

    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
    else:
        new_width = int(math.floor((float(width) / height) * size))
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(
        image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
    )
    return img.astype(np.float32)

def tensor_normalize(tensor, mean, std, func=None):
    """
    Normalize a given tensor by subtracting the mean and dividing by the std.

    Args:
        tensor (tensor): tensor to normalize. A uint8 tensor is first
            converted to float and divided by 255.
        mean (tensor or list or tuple): mean value to subtract.
        std (tensor or list or tuple): std to divide by.
        func (callable, optional): applied to the tensor before normalization.

    Returns:
        tensor: the normalized tensor.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    # isinstance (rather than an exact type comparison) also accepts tuples
    # and list subclasses.
    if isinstance(mean, (list, tuple)):
        mean = torch.tensor(mean)
    if isinstance(std, (list, tuple)):
        std = torch.tensor(std)
    if func is not None:
        tensor = func(tensor)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor

def scale_boxes(size, boxes, height, width):
    """
    Scale bounding boxes by the factor that brings the image's short side to `size`.

    Args:
        size (int): target length of the short side of the image.
        boxes (ndarray or tensor): bounding boxes to scale, `num boxes` x 4.
            They are scaled in place with `*=`, so they must hold floats.
        height (int): the height of the original image.
        width (int): the width of the original image.

    Returns:
        The same `boxes` object that was passed in, after scaling.
    """
    # The short side already equals `size`: nothing to scale.
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return boxes

    if width < height:
        # Portrait image: width becomes `size`, so derive the factor from the height.
        new_height = int(math.floor((float(height) / width) * size))
        boxes *= float(new_height) / height
    else:
        # Landscape or square image: height becomes `size`.
        new_width = int(math.floor((float(width) / height) * size))
        boxes *= float(new_width) / width
    return boxes

def process_cv2_inputs(frames, num_frames=32, alpha=4):
    """
    Normalize one clip and build the slow and fast pathway inputs.

    Args:
        frames (list of array): input images of one clip, each
            `height` x `width` x `channel`, in range [0, 255].
        num_frames (int): number of frames sampled uniformly from the clip
            for the fast pathway.
        alpha (int): temporal stride between pathways; the slow pathway gets
            `num_frames // alpha` frames sampled uniformly from the fast one.

    Returns:
        list of tensor: `[slow_pathway, fast_pathway]`, each with layout
        1 x C x T x H x W.
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, [0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Uniformly sample `num_frames` frames along the temporal axis.
    index = torch.linspace(0, inputs.shape[1] - 1, num_frames).long()
    inputs = torch.index_select(inputs, 1, index)
    fast_pathway = inputs
    # Temporal sub-sampling of the fast pathway gives the slow pathway.
    slow_pathway = torch.index_select(
        inputs,
        1,
        torch.linspace(
            0, inputs.shape[1] - 1, inputs.shape[1] // alpha
        ).long(),
    )
    frame_list = [slow_pathway, fast_pathway]
    # Add the leading batch dimension to each pathway.
    inputs = [inp.unsqueeze(0) for inp in frame_list]
    return inputs



slowfast = onnxruntime.InferenceSession('/content/SLOWFAST_32x2_R101_50_50.onnx')


cap = cv2.VideoCapture("/content/atm_125.mp4")


while was_read:



    while was_read and len(frames) < seq_length:

        was_read, frame =cap.read()



    bboxes = process_image(yolov5,frames[64//2])

    if bboxes is not None:

      frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames]

      frames = [scale(256, frame) for frame in frames]

      inputs = process_cv2_inputs(frames)

      if bboxes is not None:

          bboxes = scale_boxes(256,bboxes,1080,1920)

          index_pad = torch.full(

              size=(bboxes.shape[0], 1),




          # Pad frame index for each box.

          bboxes = torch.cat([index_pad, bboxes], axis=1)

      for i in range(len(inputs)):

        inputs[i] = inputs[i].numpy()

      if bboxes is not None:

          outputs = slowfast.run(None, {'slow_pathway': inputs[0],'fast_pathway':inputs[1],'bbox':bboxes})

          for i in range(80):

            if outputs[0][0][i]>0.3:





6 slowfast python Tensorrt 推理

6.1 导出Tensorrt


一开始,本文尝试直接将onnx导出为TensorRT,但导出失败,查找原因是 roi_align 在TensorRT中还未实现(roi_align 将在下个版本的TensorRT中实现)。



6.2 Tensorrt推理代码

import ctypes

import os

import numpy as np

import cv2

import random

import tensorrt as trt

import pycuda.autoinit

import pycuda.driver as cuda

import threading

import time

class TrtInference():

    _batch_size = 1

    def __init__(self, model_path=None, cuda_ctx=None):

        self._model_path = model_path

        if self._model_path is None:

            print("please set trt model path!")


        self.cuda_ctx = cuda_ctx

        if self.cuda_ctx is None:

            self.cuda_ctx = cuda.Device(0).make_context()

        if self.cuda_ctx:


        self.trt_logger = trt.Logger(trt.Logger.INFO)


        self.engine = self._load_engine()


            self.context = self.engine.create_execution_context()

            self.stream = cuda.Stream()

            for index, binding in enumerate(self.engine):

                if self.engine.binding_is_input(binding):

                    batch_shape = list(self.engine.get_binding_shape(binding)).copy()

                    batch_shape[0] = self._batch_size

                    self.context.set_binding_shape(index, batch_shape)

            self.host_inputs, self.host_outputs, self.cuda_inputs, self.cuda_outputs, self.bindings = self._allocate_buffers()

        except Exception as e:

            raise RuntimeError('fail to allocate CUDA resources') from e


            if self.cuda_ctx:


    def _load_plugins(self):


    def _load_engine(self):

        with open(self._model_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:

            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):

        host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \

            [], [], [], [], []

        for index, binding in enumerate(self.engine):

            size = trt.volume(self.context.get_binding_shape(index)) * \


            host_mem = cuda.pagelocked_empty(size, np.float32)

            cuda_mem = cuda.mem_alloc(host_mem.nbytes)


            if self.engine.binding_is_input(binding):






        return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings

    def destroy(self):

        """Free CUDA memories and context."""

        del self.cuda_outputs

        del self.cuda_inputs

        del self.stream

        if self.cuda_ctx:


            del self.cuda_ctx

    def inference(self, inputs):

        np.copyto(self.host_inputs[0], inputs[0].ravel())

        np.copyto(self.host_inputs[1], inputs[1].ravel())

        if self.cuda_ctx:



            self.cuda_inputs[0], self.host_inputs[0], self.stream)


            self.cuda_inputs[1], self.host_inputs[1], self.stream)






            self.host_outputs[0], self.cuda_outputs[0], self.stream)


            self.host_outputs[1], self.cuda_outputs[1], self.stream)


        if self.cuda_ctx:


        output = [self.host_outputs[0],self.host_outputs[1]]

        return output

class TrtInference_head():

    _batch_size = 1

    def __init__(self, model_path=None, cuda_ctx=None):

        self._model_path = model_path

        if self._model_path is None:

            print("please set trt model path!")


        self.cuda_ctx = cuda_ctx

        if self.cuda_ctx is None:

            self.cuda_ctx = cuda.Device(0).make_context()

        if self.cuda_ctx:


        self.trt_logger = trt.Logger(trt.Logger.INFO)


        self.engine = self._load_engine()


            self.context = self.engine.create_execution_context()

            self.stream = cuda.Stream()

            for index, binding in enumerate(self.engine):

                if self.engine.binding_is_input(binding):

                    batch_shape = list(self.engine.get_binding_shape(binding)).copy()

                    batch_shape[0] = self._batch_size

                    self.context.set_binding_shape(index, batch_shape)

            self.host_inputs, self.host_outputs, self.cuda_inputs, self.cuda_outputs, self.bindings = self._allocate_buffers()

        except Exception as e:

            raise RuntimeError('fail to allocate CUDA resources') from e


            if self.cuda_ctx:


    def _load_plugins(self):


    def _load_engine(self):

        with open(self._model_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:

            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):

        host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \

            [], [], [], [], []

        for index, binding in enumerate(self.engine):

            size = trt.volume(self.context.get_binding_shape(index)) * \


            host_mem = cuda.pagelocked_empty(size, np.float32)

            cuda_mem = cuda.mem_alloc(host_mem.nbytes)


            if self.engine.binding_is_input(binding):






        return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings

    def destroy(self):

        """Free CUDA memories and context."""

        del self.cuda_outputs

        del self.cuda_inputs

        del self.stream

        if self.cuda_ctx:


            del self.cuda_ctx

    def inference(self, inputs):

        np.copyto(self.host_inputs[0], inputs[0].ravel())

        np.copyto(self.host_inputs[1], inputs[1].ravel())

        if self.cuda_ctx:



            self.cuda_inputs[0], self.host_inputs[0], self.stream)


            self.cuda_inputs[1], self.host_inputs[1], self.stream)






            self.host_outputs[0], self.cuda_outputs[0], self.stream)


        if self.cuda_ctx:


        output = self.host_outputs[0]

        return output

import torch

import math

from torchvision.ops import roi_align

import argparse

import os

import platform

import shutil

import time

from pathlib import Path

import sys

import json

sys.path.insert(1, '/content/drive/MyDrive/yolov5/')

import cv2

import torch

import torch.backends.cudnn as cudnn

import numpy as np

import argparse

import time

import cv2

import torch

import torch.backends.cudnn as cudnn

from numpy import random

from models.common import DetectMultiBackend

from utils.augmentations import letterbox

from utils.general import check_img_size, non_max_suppression, scale_coords, set_logging

from utils.torch_utils import select_device

# ####### 参数设置

conf_thres = 0.89

iou_thres = 0.5


imgsz = 640

weights = "/content/yolov5l.pt"

device = '0'

stride = 32

names = ["person"]

import os

def init():
    """Load the YOLOv5 detector and publish its settings to module globals.

    Builds a ``DetectMultiBackend`` from the module-level ``weights`` path on
    the device returned by ``select_device('0')``, rebinds the module globals
    ``device``, ``stride`` and ``imgsz`` from the loaded model, converts the
    model with ``model.half()`` and returns it.

    Returns:
        The loaded model, after ``model.half()`` has been applied.
    """

    # Initialize

    # These module-level names are rebound below; process_image() reads them.
    global imgsz, device, stride


    device = select_device('0')

    # NOTE(review): `half` is not read again in this function; model.half()
    # below runs unconditionally, whatever the device type is.
    half = device.type != 'cpu'  # half precision only supported on CUDA

    model = DetectMultiBackend(weights, device=device, dnn=False)

    # Only `stride` is used afterwards; pt/jit/engine are unpacked but unused here.
    stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine

    imgsz = check_img_size(imgsz, s=stride)  # check img_size against the model stride

    model.half()  # to FP16


    return model

def process_image(model, input_image=None, args=None, **kwargs):

    img0 = input_image

    img = letterbox(img0, new_shape=imgsz, stride=stride, auto=True)[0]

    img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB

    img = np.ascontiguousarray(img)

    img = torch.from_numpy(img).to(device)

    img = img.half()

    img /= 255.0  # 0 - 255 to 0.0 - 1.0

    if len(img.shape) == 3:

        img = img[None]

    pred = model(img, augment=False, val=True)[0]

    pred = non_max_suppression(pred, conf_thres, iou_thres, agnostic=False)


    for i, det in enumerate(pred):  # detections per image

        gn = torch.tensor(img0.shape)[[1, 0, 1, 0]]  # normalization gain whwh

        if det is not None and len(det):

            # Rescale boxes from img_size to im0 size

            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()

            for *xyxy, conf, cls in det:

                if cls==0:


    if len(result)==0:

      return None

    for i in range(32-len(result)):


    return torch.from_numpy(np.array(result))

def scale(size, image):
    """
    Scale the short side of the image to `size`, keeping the aspect ratio.

    Args:
        size (int): target length of the short side.
        image (array): image to scale, `height` x `width` x `channel`.

    Returns:
        (ndarray): the scaled image as float32, `height` x `width` x `channel`.
        If the short side already equals `size`, the input object is returned
        unchanged (no resize and no dtype conversion).
    """
    height = image.shape[0]
    width = image.shape[1]
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return image

    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
    else:
        new_width = int(math.floor((float(width) / height) * size))
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(
        image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
    )
    return img.astype(np.float32)

def tensor_normalize(tensor, mean, std, func=None):
    """
    Normalize a given tensor by subtracting the mean and dividing by the std.

    Args:
        tensor (tensor): tensor to normalize. A uint8 tensor is first
            converted to float and divided by 255.
        mean (tensor or list or tuple): mean value to subtract.
        std (tensor or list or tuple): std to divide by.
        func (callable, optional): applied to the tensor before normalization.

    Returns:
        tensor: the normalized tensor.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    # isinstance (rather than an exact type comparison) also accepts tuples
    # and list subclasses.
    if isinstance(mean, (list, tuple)):
        mean = torch.tensor(mean)
    if isinstance(std, (list, tuple)):
        std = torch.tensor(std)
    if func is not None:
        tensor = func(tensor)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor

def scale_boxes(size, boxes, height, width):
    """
    Scale bounding boxes by the factor that brings the image's short side to `size`.

    Args:
        size (int): target length of the short side of the image.
        boxes (ndarray or tensor): bounding boxes to scale, `num boxes` x 4.
            They are scaled in place with `*=`, so they must hold floats.
        height (int): the height of the original image.
        width (int): the width of the original image.

    Returns:
        The same `boxes` object that was passed in, after scaling.
    """
    # The short side already equals `size`: nothing to scale.
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return boxes

    if width < height:
        # Portrait image: width becomes `size`, so derive the factor from the height.
        new_height = int(math.floor((float(height) / width) * size))
        boxes *= float(new_height) / height
    else:
        # Landscape or square image: height becomes `size`.
        new_width = int(math.floor((float(width) / height) * size))
        boxes *= float(new_width) / width
    return boxes

def process_cv2_inputs(frames, num_frames=32, alpha=4):
    """
    Normalize one clip and build the slow and fast pathway inputs.

    Args:
        frames (list of array): input images of one clip, each
            `height` x `width` x `channel`, in range [0, 255].
        num_frames (int): number of frames sampled uniformly from the clip
            for the fast pathway.
        alpha (int): temporal stride between pathways; the slow pathway gets
            `num_frames // alpha` frames sampled uniformly from the fast one.

    Returns:
        list of tensor: `[slow_pathway, fast_pathway]`, each with layout
        1 x C x T x H x W.
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, [0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Uniformly sample `num_frames` frames along the temporal axis.
    index = torch.linspace(0, inputs.shape[1] - 1, num_frames).long()
    inputs = torch.index_select(inputs, 1, index)
    fast_pathway = inputs
    # Temporal sub-sampling of the fast pathway gives the slow pathway.
    slow_pathway = torch.index_select(
        inputs,
        1,
        torch.linspace(
            0, inputs.shape[1] - 1, inputs.shape[1] // alpha
        ).long(),
    )
    frame_list = [slow_pathway, fast_pathway]
    # Add the leading batch dimension to each pathway.
    inputs = [inp.unsqueeze(0) for inp in frame_list]
    return inputs



slowfast = TrtInference('/content/SLOWFAST_32x2_R101_50_50.engine',None)

head = TrtInference_head('/content/SLOWFAST_head.engine',None)


cap = cv2.VideoCapture("/content/atm_125.mp4")


while was_read:



    while was_read and len(frames) < seq_length:

        was_read, frame =cap.read()



    bboxes = process_image(yolov5,frames[64//2])

    if bboxes is not None:

      frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames]

      frames = [scale(256, frame) for frame in frames]

      inputs = process_cv2_inputs(frames)


      if bboxes is not None:

          bboxes = scale_boxes(256,bboxes,1080,1920)

          index_pad = torch.full(

              size=(bboxes.shape[0], 1),




          # Pad frame index for each box.

          bboxes = torch.cat([index_pad, bboxes], axis=1)

      for i in range(len(inputs)):

        inputs[i] = inputs[i].numpy()

      if bboxes is not None:








          outputs[0] = outputs[0].numpy()

          outputs[1] = outputs[1].numpy()



          for i in range(80):

            if prd[0][i]>0.3:





slow_pathway 和 fast_pathway 经过slowfast主体模型后,通过 reshape 得到 roi_align 需要的维度,再将reshape后的结果、bbox以及相应的参数传入 roi_align,得到head模型需要的输入。

7.slowfast C++ tensorrt 部署

7.1 yolov5 C++ 目标检测

yolov5 本文就不介绍了,我直接使用平台自带的yolov5 tensorrt 代码


7.2  deepsort C++ 目标追踪




#include "deepsort.h"


 DeepSortBox yolov5识别的结果

 DeepSortBox 结构










 img 为原始的



DS->sort(img, DeepSortBox);

7.3    slowfast C++ 目标动作识别














body.onnx以及head.onnx 通过Tensorrt加载,并且开辟Tensorrt推理运行空间,代码如下

void loadheadOnnx(const std::string strModelName)


    Logger gLogger;

    //根据tensorrt pipeline 构建网络

    IBuilder* builder = createInferBuilder(gLogger);


    const auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);  

    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);

    parser->parseFromFile(strModelName.c_str(), static_cast(ILogger::Severity::kWARNING));

    IBuilderConfig* config = builder->createBuilderConfig();

    config->setMaxWorkspaceSize(1ULL << 30);    

    m_CudaheadEngine = builder->buildEngineWithConfig(*network, *config);    

    std::string strTrtName = strModelName;

    size_t sep_pos = strTrtName.find_last_of(".");

    strTrtName = strTrtName.substr(0, sep_pos) + ".trt";

    IHostMemory *gieModelStream = m_CudaheadEngine->serialize();

    std::string serialize_str;

    std::ofstream serialize_output_stream;






    m_CudaheadContext = m_CudaheadEngine->createExecutionContext();






Step2: 为输入输出数据开辟空间

body.onnx 的输入为 slow_pathway 和 fast_pathway,维度均为 (B,C,T,H,W):其中 slow_pathway 的 T 为 8,输出为 (B,2048,16,29);fast_pathway 的 T 为 32,输出为 (B,256,16,29)。head 的输入为 (32,2048,7,7) 和 (32,256,7,7),输出为 (32,80),具体代码实现如下:

 // Look up, by name, the binding indices of the backbone engine's two inputs
 // and two outputs. Each index is then used to address the per-binding device
 // buffer, host buffer and byte-size arrays.
 // NOTE(review): none of the cudaMalloc / malloc results below are checked.
 slow_pathway_InputIndex = m_CudaslowfastEngine->getBindingIndex(slow_pathway_NAME);

    fast_pathway_InputIndex = m_CudaslowfastEngine->getBindingIndex(fast_pathway_NAME);

    slow_pathway_OutputIndex = m_CudaslowfastEngine->getBindingIndex(slow_pathway_OUTPUT);

    fast_pathway_OutputIndex = m_CudaslowfastEngine->getBindingIndex(fast_pathway_OUTPUT);

    // slow_pathway input: element count is the product of its five binding
    // dimensions; allocate a device buffer and a host buffer of that many
    // floats and record the size in bytes.
    dims_i = m_CudaslowfastEngine->getBindingDimensions(slow_pathway_InputIndex);


    SDKLOG(INFO) << "slow_pathway dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3]<< " " << dims_i.d[4];

    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3]* dims_i.d[4];

    cudaMalloc(&slowfast_ArrayDevMemory[slow_pathway_InputIndex], size * sizeof(float));

    slowfast_ArrayHostMemory[slow_pathway_InputIndex] = malloc(size * sizeof(float));

    slowfast_ArraySize[slow_pathway_InputIndex]=size* sizeof(float);


    // fast_pathway input: same scheme, again over five dimensions.
    dims_i = m_CudaslowfastEngine->getBindingDimensions(fast_pathway_InputIndex);

    SDKLOG(INFO) << "fast_pathway dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3]<< " " << dims_i.d[4];

    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3]* dims_i.d[4];

    cudaMalloc(&slowfast_ArrayDevMemory[fast_pathway_InputIndex], size * sizeof(float));

    slowfast_ArrayHostMemory[fast_pathway_InputIndex] = malloc(size * sizeof(float));

    slowfast_ArraySize[fast_pathway_InputIndex]=size* sizeof(float);



    // slow_pathway output: only the first four dimensions are multiplied.
    dims_i = m_CudaslowfastEngine->getBindingDimensions(slow_pathway_OutputIndex);

    SDKLOG(INFO) << "slow_out dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3];

    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3];

    cudaMalloc(&slowfast_ArrayDevMemory[slow_pathway_OutputIndex], size * sizeof(float));

    slowfast_ArrayHostMemory[slow_pathway_OutputIndex] = malloc(size * sizeof(float));

    slowfast_ArraySize[slow_pathway_OutputIndex]=size* sizeof(float);




    // fast_pathway output: same four-dimension scheme.
    dims_i = m_CudaslowfastEngine->getBindingDimensions(fast_pathway_OutputIndex);

    SDKLOG(INFO) << "fast_out dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3];

    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3];

    cudaMalloc(&slowfast_ArrayDevMemory[fast_pathway_OutputIndex], size * sizeof(float));

    slowfast_ArrayHostMemory[fast_pathway_OutputIndex] = malloc(size * sizeof(float));

    slowfast_ArraySize[fast_pathway_OutputIndex]=size* sizeof(float);





    // Buffers for ROIAlign slots 0, 1 and 2.
    // NOTE(review): `size` is not reassigned in the visible lines before any of
    // the three ROIAlign allocations or before boxes_data; as listed they all
    // reuse the fast_pathway output size. The statements that set `size` appear
    // to be missing from this listing — confirm against the full source.
    cudaMalloc(&ROIAlign_ArrayDevMemory[0], size * sizeof(float));

    ROIAlign_ArrayHostMemory[0] = malloc(size * sizeof(float));

    ROIAlign_ArraySize[0]=size* sizeof(float);



    cudaMalloc(&ROIAlign_ArrayDevMemory[1], size * sizeof(float));

    ROIAlign_ArrayHostMemory[1] = malloc(size * sizeof(float));

    ROIAlign_ArraySize[1]=size* sizeof(float);




    cudaMalloc(&ROIAlign_ArrayDevMemory[2], size * sizeof(float));

    ROIAlign_ArrayHostMemory[2] = malloc(size * sizeof(float));

    ROIAlign_ArraySize[2]=size* sizeof(float);


    // Host-only buffer for the box data (no device counterpart is allocated here).
    boxes_data= malloc(size * sizeof(float));

    // Query the dimensions of binding 0 of the head engine.
    dims_i = m_CudaheadEngine->getBindingDimensions(0);





  cv::Mat framesimg = img.clone();

        cv::cvtColor(framesimg, framesimg, cv::COLOR_BGR2RGB);

        int height = framesimg.rows;

        int width = framesimg.cols;

        // 对图像进行预处理


        int size=256;

        int new_width = width;

        int new_height = height;

        if ((width <= height && width == size) || (height <= width and height == size)){




            new_width = size;

            new_height = size;


                new_height = int((float(height) / width) * size);


                new_width = int((float(width) / height) * size);


            cv::resize(framesimg, framesimg, cv::Size{new_width,new_height},cv::INTER_LINEAR);


之后对图像进行归一化操作,并且按照 CTHW 的顺序进行排列,其中 C 为通道,T 为帧的时间顺序,H 为图像的高度,W 为图像的宽度。由于 slowfast 有两个输入:一个输入为 fast_pathway,包含 32 帧图像,维度为 (b,c,T,h,w),其中 T 为 32,因此需要每两帧添加一次图像数据到 fast_pathway 中;另外一个输入为 slow_pathway,包含 8 帧图像,维度为 (b,c,T,h,w),其中 T 为 8,因此需要每四帧添加一次图像数据到 slow_pathway 中。具体代码如下:

  // Copy the current frame, normalised, into the fast_pathway host input buffer.
  // NOTE(review): the loop braces and the code that decides when a frame is
  // written and how fast_index / slow_index advance are missing from this
  // listing.
  float *data=(float *)slowfast_ArrayHostMemory[fast_pathway_InputIndex];

        new_width =  framesimg.cols;

        new_height = framesimg.rows;

        for (size_t c = 0; c < 3; c++)


            for (size_t  h = 0; h < new_height; h++)


                for (size_t w = 0; w < new_width; w++)


                    // Normalise: v = (pixel / 255 - 0.45) / 0.225.
                    // NOTE(review): cv::Mat::at is a template; its type
                    // argument (presumably <cv::Vec3b>) seems to have been
                    // stripped from this line — confirm.
                    float v=((float)framesimg.at(h, w)[c]) / 255.0f;

                    v -= 0.45;

                    v /= 0.225;

                    // The channel stride is hard-coded as 32*256*455 while the
                    // frame stride is new_width * new_height; the two agree
                    // only when the resized frame has 256*455 pixels.
                    data[c*32*256*455+fast_index* new_width * new_height + h * new_width + w] =v;






            // Same copy into the slow_pathway host input buffer; here the
            // hard-coded channel stride is 8*256*455.
            data=(float *)slowfast_ArrayHostMemory[slow_pathway_InputIndex];

            for (size_t c = 0; c < 3; c++)


                for (size_t  h = 0; h < new_height; h++)


                    for (size_t w = 0; w < new_width; w++)


                       float v=((float)framesimg.at(h, w)[c]) / 255.0f;

                        v -= 0.45;

                        v /= 0.225;

                        data[c*8*256*455+slow_index* new_width * new_height + h * new_width + w] =v;






Step4: roi_align实现


// CPU RoI-align: for every RoI, sample the feature map on an
// aligned_height x aligned_width grid using bilinear interpolation.
//
// bottom_data    feature map, indexed as [c][h][w] (channels x height x width).
// spatial_scale  factor applied to the RoI coordinates to map them onto the
//                feature map.
// num_rois       number of RoIs in bottom_rois.
// height, width  spatial size of the feature map.
// channels       number of feature-map channels.
// aligned_height, aligned_width
//                output grid size per RoI; both must be greater than 1 because
//                the bin size divides by (aligned - 1).
// bottom_rois    5 floats per RoI; elements 1..4 are read as start_w, start_h,
//                end_w, end_h. Element 0 is ignored: the batch index is fixed
//                to 0, so only the first image of bottom_data is sampled.
// top_data       output, num_rois * channels * aligned_height * aligned_width
//                floats laid out as [n][c][ph][pw].
//
// Sample points that fall outside the feature map produce 0.
void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois,

                     const int height, const int width, const int channels,

                     const int aligned_height, const int aligned_width, const float * bottom_rois,

                     float* top_data)
{
    const int output_size = num_rois * aligned_height * aligned_width * channels;

    int idx = 0;

    for (idx = 0; idx < output_size; ++idx)
    {
        // (n, c, ph, pw) is the output element addressed by the flat index.
        int pw = idx % aligned_width;

        int ph = (idx / aligned_width) % aligned_height;

        int c = (idx / aligned_width / aligned_height) % channels;

        int n = idx / aligned_width / aligned_height / channels;

        float roi_batch_ind = 0;

        float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;

        float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;

        float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;

        float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

        // Clamp at 0 so a malformed RoI (end before start) cannot produce a
        // negative extent.
        float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);

        float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);

        float bin_size_h = roi_height / (aligned_height - 1.);

        float bin_size_w = roi_width / (aligned_width - 1.);

        // Sample position on the feature map for this output cell.
        float h = (float)(ph) * bin_size_h + roi_start_h;

        float w = (float)(pw) * bin_size_w + roi_start_w;

        // Cap at size - 2 so the +1 neighbours used below stay in range.
        int hstart = fminf(floor(h), height - 2);

        int wstart = fminf(floor(w), width - 2);

        int img_start = roi_batch_ind * channels * height * width;

        if (h < 0 || h >= height || w < 0 || w >= width)
        {
            // Sample point lies outside the feature map.
            top_data[idx] = 0.;
        }
        else
        {
            // Bilinear interpolation between the four surrounding cells.
            float h_ratio = h - (float)(hstart);

            float w_ratio = w - (float)(wstart);

            int upleft = img_start + (c * height + hstart) * width + wstart;

            int upright = upleft + 1;

            int downleft = upleft + width;

            int downright = downleft + 1;

            top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)

                + bottom_data[upright] * (1. - h_ratio) * w_ratio

                + bottom_data[downleft] * h_ratio * (1. - w_ratio)

                + bottom_data[downright] * h_ratio * w_ratio;
        }
    }
}





首先将 Step3中准备好的数据使用body进行推理,将推理结果使用Step4中的roi_align函数进行提取bbox对应的特征,最后将提取的特征使用head模型进行推理,得到output。具体代码实现如下

cudaMemcpyAsync(slowfast_ArrayDevMemory[slow_pathway_InputIndex], slowfast_ArrayHostMemory[slow_pathway_InputIndex], slowfast_ArraySize[slow_pathway_InputIndex], cudaMemcpyHostToDevice, m_CudaStream);

    cudaMemcpyAsync(slowfast_ArrayDevMemory[fast_pathway_InputIndex], slowfast_ArrayHostMemory[fast_pathway_InputIndex], slowfast_ArraySize[fast_pathway_InputIndex], cudaMemcpyHostToDevice, m_CudaStream);

    m_CudaslowfastContext->enqueueV2(slowfast_ArrayDevMemory , m_CudaStream, nullptr);    

   cudaMemcpyAsync(slowfast_ArrayHostMemory[slow_pathway_OutputIndex], slowfast_ArrayDevMemory[slow_pathway_OutputIndex], slowfast_ArraySize[slow_pathway_OutputIndex], cudaMemcpyDeviceToHost, m_CudaStream);

    cudaMemcpyAsync(slowfast_ArrayHostMemory[fast_pathway_OutputIndex], slowfast_ArrayDevMemory[fast_pathway_OutputIndex], slowfast_ArraySize[fast_pathway_OutputIndex], cudaMemcpyDeviceToHost, m_CudaStream);



    ROIAlignForwardCpu((float*)slowfast_ArrayHostMemory[slow_pathway_OutputIndex], 0.0625, 32,16,29, 2048,7, 7, (float*)boxes_data,       (float*)ROIAlign_ArrayHostMemory[0]);

    ROIAlignForwardCpu((float*)slowfast_ArrayHostMemory[fast_pathway_OutputIndex], 0.0625, 32,16,29, 256,7, 7, (float*)boxes_data,       (float*)ROIAlign_ArrayHostMemory[1]);


    cudaMemcpyAsync(ROIAlign_ArrayDevMemory[0], ROIAlign_ArrayHostMemory[0], ROIAlign_ArraySize[0], cudaMemcpyHostToDevice, m_CudaStream);

    cudaMemcpyAsync(ROIAlign_ArrayDevMemory[1], ROIAlign_ArrayHostMemory[1], ROIAlign_ArraySize[1], cudaMemcpyHostToDevice, m_CudaStream);

    m_CudaheadContext->enqueueV2(ROIAlign_ArrayDevMemory, m_CudaStream, nullptr);

    cudaMemcpyAsync(ROIAlign_ArrayHostMemory[2], ROIAlign_ArrayDevMemory[2], ROIAlign_ArraySize[2], cudaMemcpyDeviceToHost, m_CudaStream);



1. https://blog.csdn.net/y459541195/article/details/126278476

2. https://blog.csdn.net/WhiffeYF/article/details/115581800

3. https://github.com/facebookresearch/SlowFast



