本章节主要讲解OpenCV在人工智能领域的深度应用,包括深度学习模型推理、目标检测、图像分割、人脸识别、姿态估计、OCR文字识别等高级功能。这些技术是现代计算机视觉的核心,广泛应用于自动驾驶、智能监控、医疗影像、机器人视觉等领域。
DNN(Deep Neural Network)是OpenCV中专门用于深度学习模型推理的模块。它允许我们在OpenCV环境中直接加载和运行经过训练的深度学习模型,而不需要依赖TensorFlow、PyTorch等框架。这意味着部署时会更加轻量级,推理速度更快。
简单来说:DNN模块就是OpenCV调用深度学习模型的"翻译器",它可以把各种框架训练好的模型转换为OpenCV能理解并执行的格式。
支持的主流框架模型格式:
# Install OpenCV (make sure the build includes the DNN module)
pip install opencv-python
# Verify that the DNN module is available
import cv2
print(cv2.getBuildInformation())
如果输出中包含 "DNN" 相关信息,说明DNN模块已正确安装。
import cv2
import numpy as np
# Step 1: load the model
# Arguments: model config file (.prototxt) and weights (.caffemodel)
net = cv2.dnn.readNetFromCaffe('deploy.prototxt', 'model.caffemodel')
# Or load an ONNX model instead (this overwrites the Caffe net above — pick one)
net = cv2.dnn.readNetFromONNX('model.onnx')
# Step 2: prepare the input data
# Convert the image into the 4-D blob format the model expects.
# Arguments: image, scale factor, output size, per-channel mean (for normalization)
# NOTE(review): `image` is assumed to be a BGR ndarray loaded earlier
# (e.g. via cv2.imread) — it is not defined in this snippet.
blob = cv2.dnn.blobFromImage(image, 1/255.0, (224, 224), (0, 0, 0), swapRB=True)
# Step 3: feed the blob to the network
net.setInput(blob)
# Step 4: forward pass (inference)
# Get the names of the unconnected (output) layers
output_layers = net.getUnconnectedOutLayersNames()
# Run inference
outputs = net.forward(output_layers)
print("推理完成,输出形状:", outputs[0].shape)
参数详解:
- blobFromImage 中的 1/255.0 是缩放因子,将像素值从 0-255 归一化到 0-1;
- (224, 224) 是模型期望的输入尺寸;
- (0, 0, 0) 是通道均值,预处理时会从每个通道中减去对应的均值;
- swapRB=True 表示交换红色和蓝色通道(因为 OpenCV 使用 BGR,而大多数模型使用 RGB)。

目标检测(Object Detection)是计算机视觉中的核心任务之一,它不仅要识别图像中有什么物体,还要标出物体的位置(边界框)。这是许多AI应用的基础,比如自动驾驶需要检测车辆、行人、交通标志等。
目标检测的两个主要指标:
YOLO(You Only Look Once)是目前最流行的实时目标检测算法之一,其特点是检测速度极快,适合实时应用场景。
YOLO的发展历程:
使用OpenCV加载YOLO模型:
import cv2
import numpy as np

# Load the YOLOv4 model (Darknet config + weights)
net = cv2.dnn.readNetFromDarknet('yolov4.cfg', 'yolov4.weights')
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

# Load the class names (one per line)
with open('coco.names', 'r') as f:
    classes = [line.strip() for line in f]

# Read the image
img = cv2.imread('image.jpg')
height, width, _ = img.shape

# Prepare the input blob (YOLO expects 416x416 RGB, pixels scaled to [0, 1])
blob = cv2.dnn.blobFromImage(img, 1/255.0, (416, 416), swapRB=True)
net.setInput(blob)

# FIX: getUnconnectedOutLayers() returns plain ints in OpenCV >= 4.5.4, so
# indexing with i[0] crashes there. getUnconnectedOutLayersNames() works
# across all OpenCV 3.x/4.x versions.
output_layers = net.getUnconnectedOutLayersNames()

# Forward pass
outputs = net.forward(output_layers)

# Parse the detections
conf_threshold = 0.5  # confidence threshold
nms_threshold = 0.4   # non-maximum suppression (IoU) threshold
boxes = []
confidences = []
class_ids = []
for output in outputs:
    for detection in output:
        # detection = [cx, cy, w, h, objectness, per-class scores...]
        scores = detection[5:]
        class_id = np.argmax(scores)
        confidence = scores[class_id]
        if confidence > conf_threshold:
            # Coordinates are normalized; convert center/size to a pixel corner box
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            boxes.append([x, y, w, h])
            confidences.append(float(confidence))
            class_ids.append(class_id)

# Non-maximum suppression: drop overlapping duplicate boxes
indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

# Draw the surviving detections.
# FIX: NMSBoxes returns shape (N, 1) in old OpenCV builds and (N,) in new
# ones; flattening handles both without per-element isinstance checks.
for i in np.array(indices).flatten():
    x, y, w, h = boxes[i]
    label = str(classes[class_ids[i]])
    confidence = confidences[i]
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(img, f"{label} {confidence:.2f}", (x, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

cv2.imshow('YOLO Detection', img)
cv2.waitKey(0)
cv2.destroyAllWindows()
YOLO原理解释:
YOLO的核心思想是将目标检测问题转化为回归问题。它将输入图像划分为S×S的网格,每个网格负责预测B个边界框及其置信度,以及C个类别的概率。这种设计使得YOLO可以在单次前向传播中完成检测,因此速度极快。
SSD是另一种实时目标检测算法,它在不同尺度的特征图上进行检测,可以很好地检测不同大小的物体。
import cv2
import numpy as np

# Load the SSD MobileNet model (lightweight, suitable for mobile/edge devices)
net = cv2.dnn.readNetFromTensorflow('ssd_mobilenet_v2_coco.pb',
                                    'ssd_mobilenet_v2_coco.pbtxt')
# Select the compute backend/target
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

# Read the image
img = cv2.imread('image.jpg')
height, width, _ = img.shape

# Prepare the input (SSD expects a 300x300 input; means center pixels at 0)
blob = cv2.dnn.blobFromImage(img, 1.0, (300, 300), (127.5, 127.5, 127.5), swapRB=True)
net.setInput(blob)

# Inference; output shape is (1, 1, N, 7) where each row is
# [batch_id, class_id, confidence, x1, y1, x2, y2] with normalized coords
output = net.forward()

# Parse the detections
for detection in output[0, 0, :, :]:
    confidence = float(detection[2])
    if confidence > 0.5:
        class_id = int(detection[1])
        x1 = int(detection[3] * width)
        y1 = int(detection[4] * height)
        x2 = int(detection[5] * width)
        y2 = int(detection[6] * height)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)

cv2.imshow('SSD Detection', img)
cv2.waitKey(0)
# FIX: the window was never released — destroy it like the other examples do
cv2.destroyAllWindows()
图像分割(Image Segmentation)是将图像划分为若干个有意义的区域的技术。与目标检测不同,分割不仅要找到物体的位置,还要精确勾勒出物体的边界像素。
图像分割的三种类型:
DeepLab是Google开发的深度学习分割模型,DeepLabV3+是目前最流行的分割模型之一。
import cv2
import numpy as np

# Load the DeepLabV3+ model
net = cv2.dnn.readNetFromTensorflow('deeplab_v3_plus.pb', 'deeplab_v3_plus.pbtxt')

# Read the image
img = cv2.imread('image.jpg')
original_h, original_w = img.shape[:2]

# DeepLab input size
input_size = (513, 513)

# Prepare the input
blob = cv2.dnn.blobFromImage(img, 1.0, input_size, (127.5, 127.5, 127.5), swapRB=True)
net.setInput(blob)

# Inference
output = net.forward()
print("输出形状:", output.shape)  # expected (1, 21, 513, 513): 21 class score maps

# FIX: the per-pixel class label is the argmax over the 21 class channels.
# The original used output[0, 0], i.e. only class 0's score map, which is
# not a label map at all.
mask = np.argmax(output[0], axis=0).astype(np.uint8)
# FIX: a class-id map must be resized with nearest-neighbor interpolation;
# bilinear would blend neighboring label ids into meaningless values.
mask = cv2.resize(mask, (original_w, original_h), interpolation=cv2.INTER_NEAREST)

# Color map: 21 classes -> 21 colors (PASCAL VOC palette)
colors = np.array([
    [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
    [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
    [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
    [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
    [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]
], dtype=np.uint8)

# Build the colored segmentation image.
# FIX: fancy indexing replaces the original O(H*W) Python double loop.
segmentation = colors[mask]

# Blend the segmentation over the original image
result = cv2.addWeighted(img, 0.6, segmentation, 0.4, 0)
cv2.imshow('Segmentation', result)
cv2.waitKey(0)
GrabCut是一种交互式图像分割算法,可以精确提取前景对象。
import cv2
import numpy as np

# Load the input image
img = cv2.imread('image.jpg')

# Initial rectangle assumed to enclose the foreground: (x, y, width, height)
rect = (50, 50, 300, 300)

# Mask that GrabCut reads/writes; starts as all "definite background"
mask = np.zeros(img.shape[:2], np.uint8)

# Internal GMM state for background/foreground (fixed 1x65 float64 buffers)
background_model = np.zeros((1, 65), np.float64)
foreground_model = np.zeros((1, 65), np.float64)

# Run GrabCut for 5 iterations, initializing from the rectangle.
# Arguments: image, mask, rect, bgd model, fgd model, iterations, mode
cv2.grabCut(img, mask, rect, background_model, foreground_model, 5,
            cv2.GC_INIT_WITH_RECT)

# Keep pixels labeled definite/probable foreground; background labels
# (GC_BGD == 0, GC_PR_BGD == 2) become 0, everything else becomes 1
binary_mask = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8')

# Apply the binary mask to cut out the foreground
result = img * binary_mask[:, :, np.newaxis]

cv2.imshow('Original', img)
cv2.imshow('Result', result)
cv2.waitKey(0)
cv2.destroyAllWindows()
人脸识别技术通常分为三个层次:
OpenCV提供了基于Haar Cascade和DNN的人脸检测器。
import cv2

# Option 1: Haar cascade (classic method — fast but less accurate).
# Loaded here for reference; the DNN detector below is the one actually used.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)

# Option 2: DNN-based detector (deep learning, higher accuracy)
face_net = cv2.dnn.readNetFromCaffe('deploy.prototxt', 'res10_300x300_ssd_iter_140000.caffemodel')

# Load the photo
img = cv2.imread('photo.jpg')
h, w = img.shape[:2]

# DNN face detection: 300x300 input with the model's BGR channel means
blob = cv2.dnn.blobFromImage(img, 1.0, (300, 300), (104.0, 177.0, 123.0))
face_net.setInput(blob)
detections = face_net.forward()

# Each detection row: [batch, class, confidence, x1, y1, x2, y2] (normalized)
for i in range(detections.shape[2]):
    confidence = detections[0, 0, i, 2]
    if confidence <= 0.5:
        continue
    x1 = int(detections[0, 0, i, 3] * w)
    y1 = int(detections[0, 0, i, 4] * h)
    x2 = int(detections[0, 0, i, 5] * w)
    y2 = int(detections[0, 0, i, 6] * h)
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(img, f'Face: {confidence:.2f}', (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

cv2.imshow('Face Detection', img)
cv2.waitKey(0)
面部关键点检测可以找到眼睛、鼻子、嘴巴等面部特征点的位置,这对于人脸对齐、表情分析、AR滤镜等应用非常重要。
import cv2
import numpy as np

# Load the facial-landmark model (predicts 68 points per face)
landmark_net = cv2.dnn.readNetFromTensorflow('facial_landmarks.pb', 'facial_landmarks.pbtxt')

img = cv2.imread('face.jpg')
h, w = img.shape[:2]

# Preprocess: 96x96 RGB input, pixel values scaled to [0, 1]
blob = cv2.dnn.blobFromImage(img, 1.0 / 255, (96, 96), (0, 0, 0), swapRB=True)
landmark_net.setInput(blob)

# Inference: output is a flat [x0, y0, x1, y1, ...] vector of normalized coords
landmarks = landmark_net.forward()
print("关键点数量:", landmarks.shape[1] // 2)  # expected: 68

# Draw each (x, y) pair scaled back to the image size
for idx in range(landmarks.shape[1] // 2):
    px = int(landmarks[0, 2 * idx] * w)
    py = int(landmarks[0, 2 * idx + 1] * h)
    cv2.circle(img, (px, py), 2, (0, 255, 0), -1)

cv2.imshow('Landmarks', img)
cv2.waitKey(0)
人脸识别的核心是将人脸图像转换为特征向量,然后比较两个特征向量的相似度。
import cv2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the face-embedding model (FaceNet or a similar network)
face_net = cv2.dnn.readNetFromTensorflow('facenet.pb')

def get_face_embedding(face_img):
    """Return the embedding (flat 1-D feature vector) for a cropped face image."""
    # Preprocess: resize to 160x160, scale to [0, 1], BGR -> RGB
    face_blob = cv2.dnn.blobFromImage(face_img, 1.0 / 255, (160, 160), (0, 0, 0), swapRB=True)
    face_net.setInput(face_blob)
    embedding = face_net.forward()
    return embedding.flatten()

# Known-face database (embedding vectors + matching names).
# In a real application these would be loaded from disk or a database.
known_embeddings = []
known_names = []
# known_embeddings = [...]
# known_names = ["Person1", "Person2"]

def recognize_face(face_img):
    """Match a face image against the known-face database.

    Returns (name, similarity); name is "Unknown" when no match clears
    the threshold or the database is empty.
    """
    embedding = get_face_embedding(face_img)
    # FIX: guard against an empty database — cosine_similarity raises on an
    # empty second argument, so the original crashed exactly as written.
    if not known_embeddings:
        return "Unknown", 0.0
    # Cosine similarity against every known embedding
    similarities = cosine_similarity([embedding], known_embeddings)[0]
    # Pick the best match
    best_match_idx = np.argmax(similarities)
    best_similarity = similarities[best_match_idx]
    if best_similarity > 0.7:  # acceptance threshold
        return known_names[best_match_idx], best_similarity
    return "Unknown", best_similarity
姿态估计(Pose Estimation)是检测人体关键点(如头部、肩膀、肘部、手腕、臀部、膝盖、脚踝等)位置的技术。这些关键点连接起来就构成了人体的骨架,可以用来分析人体的动作和姿态。
应用场景:
import cv2
import numpy as np

# Load the OpenPose model.
# NOTE(review): the file names say Body25, but the keypoint indices and
# skeleton below follow the 18-point COCO layout (output shape (1, 57, 46, 46)
# = 18 keypoint heatmaps + 38 PAF channels + 1 background). Either use the
# COCO .prototxt/.caffemodel, or update the indices for Body25.
net = cv2.dnn.readNetFromCaffe('openpose_body25.pbtxt', 'openpose_body25.caffemodel')
# Lighter alternative:
# net = cv2.dnn.readNetFromCaffe('openposecoco.pbtxt', 'openposecoco.caffemodel')

img = cv2.imread('person.jpg')
h, w = img.shape[:2]

# OpenPose expects a 368x368 input, pixels scaled to [0, 1]
inWidth = 368
inHeight = 368
blob = cv2.dnn.blobFromImage(img, 1.0 / 255, (inWidth, inHeight), (0, 0, 0), swapRB=True)
net.setInput(blob)

# Inference
output = net.forward()
print("输出形状:", output.shape)  # (1, 57, 46, 46)

# COCO body keypoint indices:
# 0: nose, 1: neck, 2: R_shoulder, 3: R_elbow, 4: R_wrist,
# 5: L_shoulder, 6: L_elbow, 7: L_wrist, 8: R_hip, 9: R_knee,
# 10: R_ankle, 11: L_hip, 12: L_knee, 13: L_ankle, 14: R_eye,
# 15: L_eye, 16: R_ear, 17: L_ear
n_points = 18
points = []
threshold = 0.1

# FIX: only the first 18 channels are keypoint heatmaps; the original
# iterated all 57 channels, treating PAF/background maps as keypoints and
# drawing spurious circles for them.
for i in range(n_points):
    heatMap = output[0, i, :, :]
    # Location and value of the strongest response in this heatmap
    _, conf, _, point = cv2.minMaxLoc(heatMap)
    if conf > threshold:
        # Scale heatmap coordinates back to image pixels
        x = int(point[0] * w / output.shape[3])
        y = int(point[1] * h / output.shape[2])
        points.append((x, y))
    else:
        points.append(None)

# Skeleton: pairs of keypoint indices to connect
connections = [
    (0, 1), (1, 2), (2, 3), (3, 4),    # right arm
    (1, 5), (5, 6), (6, 7),            # left arm
    (1, 8), (8, 9), (9, 10),           # right leg
    (1, 11), (11, 12), (12, 13),       # left leg
    (0, 14), (14, 16),                 # right eye -> right ear
    (0, 15), (15, 17)                  # left eye -> left ear
]

# Draw the limbs (only when both endpoints were detected)
for p1_idx, p2_idx in connections:
    if points[p1_idx] and points[p2_idx]:
        cv2.line(img, points[p1_idx], points[p2_idx], (0, 255, 0), 2)

# Draw the keypoints
for point in points:
    if point:
        cv2.circle(img, point, 3, (0, 0, 255), -1)

cv2.imshow('Pose Estimation', img)
cv2.waitKey(0)
OCR(Optical Character Recognition,光学字符识别)是将图像中的文字转换为可编辑文本的技术。这是文档数字化、车牌识别、票据处理等应用的核心技术。
传统OCR流程:
现代OCR(基于深度学习):
Tesseract是开源的OCR引擎,OpenCV可以与其配合使用。
import cv2
import pytesseract

# Point pytesseract at the Tesseract binary if it is not on PATH
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load the document image
img = cv2.imread('text_image.jpg')

# Preprocessing pipeline: grayscale -> adaptive threshold -> denoise
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 11, 2)
denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

# Run OCR (simplified Chinese + English)
text = pytesseract.image_to_string(denoised, lang='chi_sim+eng')
print("识别结果:")
print(text)

# Word-level data including bounding boxes and per-word confidences
data = pytesseract.image_to_data(denoised, output_type=pytesseract.Output.DICT)

# Overlay every confidently recognized word on the image
for i in range(len(data['text'])):
    if int(data['conf'][i]) > 60:  # confidence threshold
        x, y = data['left'][i], data['top'][i]
        bw, bh = data['width'][i], data['height'][i]
        cv2.rectangle(img, (x, y), (x + bw, y + bh), (0, 255, 0), 2)
        cv2.putText(img, data['text'][i], (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

cv2.imshow('OCR Result', img)
cv2.waitKey(0)
import cv2
import numpy as np

# Load the EAST text-detection model
net = cv2.dnn.readNet('frozen_east_text_detection.pb')

img = cv2.imread('text_image.jpg')
orig_h, orig_w = img.shape[:2]

# Preprocess: EAST expects input dimensions that are multiples of 32
blob = cv2.dnn.blobFromImage(img, 1.0, (320, 320), (123.68, 116.78, 103.94),
                             swapRB=True, crop=False)
net.setInput(blob)

# FIX: the EAST graph's output layers are the score map
# "feature_fusion/Conv_7/Sigmoid" and the geometry map
# "feature_fusion/concat_3"; the names used before (including the
# "feature_fission" typo) do not exist in the model, so forward() failed.
score_map, geometry = net.forward(["feature_fusion/Conv_7/Sigmoid",
                                   "feature_fusion/concat_3"])
def decode_predictions(scores, geometry, min_confidence=0.5):
    """Convert raw EAST score/geometry maps into corner boxes.

    scores   -- (1, 1, H, W) text-confidence map
    geometry -- (1, 5, H, W) distances to the box edges plus a rotation angle
    Returns (rects, confidences) where each rect is
    (start_x, start_y, end_x, end_y) in input-image pixels.
    """
    num_rows, num_cols = scores.shape[2:4]
    rects, confidences = [], []
    for row in range(num_rows):
        row_scores = scores[0, 0, row]
        row_geo = [geometry[0, c, row] for c in range(5)]
        for col in range(num_cols):
            score = row_scores[col]
            if score < min_confidence:
                continue
            # Each output cell covers a 4x4 pixel region of the input
            base_x, base_y = col * 4.0, row * 4.0
            angle = row_geo[4][col]
            cos_a, sin_a = np.cos(angle), np.sin(angle)
            box_h = row_geo[0][col] + row_geo[1][col]
            box_w = row_geo[2][col] + row_geo[3][col]
            # Bottom-right corner from the rotated offsets, then step back by size
            end_x = int(base_x + cos_a * row_geo[2][col] + sin_a * row_geo[3][col])
            end_y = int(base_y - sin_a * row_geo[2][col] + cos_a * row_geo[3][col])
            rects.append((int(end_x - box_w), int(end_y - box_h), end_x, end_y))
            confidences.append(score)
    return rects, confidences
rects, confidences = decode_predictions(score_map, geometry)

# FIX: cv2.dnn has no NMSBoxesRects in the Python bindings; NMSBoxes is the
# correct API and it expects (x, y, w, h) boxes, so convert the corner
# boxes before suppression.
nms_boxes = [(x1, y1, x2 - x1, y2 - y1) for (x1, y1, x2, y2) in rects]
indices = cv2.dnn.NMSBoxes(nms_boxes, [float(c) for c in confidences], 0.5, 0.4)

# Draw the boxes that survived NMS. Flattening handles both the (N, 1)
# index shape of old OpenCV builds and the (N,) shape of new ones.
for i in np.array(indices).flatten():
    start_x, start_y, end_x, end_y = rects[i]
    cv2.rectangle(img, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)

cv2.imshow('Text Detection', img)
cv2.waitKey(0)
图像分类是为整个图像分配一个类别标签的任务,是计算机视觉中最基础的任务之一。
经典数据集:
import cv2
import numpy as np

# Load SqueezeNet (a lightweight image-classification model)
net = cv2.dnn.readNetFromCaffe('squeezenet.prototxt', 'squeezenet.caffemodel')
# Alternative: MobileNet
# net = cv2.dnn.readNetFromTensorflow('mobilenet.pb')

# Load the class labels (one per line)
with open('synset_words.txt', 'r') as f:
    labels = [line.strip() for line in f]

# Load and resize the image to the model's 224x224 input
img = cv2.imread('image.jpg')
img = cv2.resize(img, (224, 224))

# Preprocess into a blob
blob = cv2.dnn.blobFromImage(img, 1.0, (224, 224), (0, 0, 0), swapRB=True)
net.setInput(blob)

# Inference: one score per class
output = net.forward()

# Top-1 prediction
prob = output.flatten()
class_id = int(np.argmax(prob))
confidence = prob[class_id]
print(f"预测类别:{labels[class_id]}")
print(f"置信度:{confidence:.4f}")

# Top-5 predictions, most likely first
top5_indices = np.argsort(prob)[-5:][::-1]
print("\n前5个最可能的类别:")
for idx in top5_indices:
    print(f"{labels[idx]}: {prob[idx]:.4f}")
在深度学习中,数据量越大、越多样化,模型的泛化能力越强。数据增强(Data Augmentation)通过对训练图像进行随机变换,可以人工增加数据的多样性。
常见的数据增强方法:
import cv2
import numpy as np
import random
def random_flip(img):
    """Flip the image horizontally with probability 0.5; otherwise return it unchanged."""
    flip_it = random.random() > 0.5
    return cv2.flip(img, 1) if flip_it else img
def random_rotation(img, max_angle=15):
    """Rotate the image about its center by a random angle in [-max_angle, max_angle] degrees."""
    rows, cols = img.shape[:2]
    theta = random.uniform(-max_angle, max_angle)
    rotation = cv2.getRotationMatrix2D((cols / 2, rows / 2), theta, 1.0)
    return cv2.warpAffine(img, rotation, (cols, rows))
def random_brightness(img, factor=0.3):
    """Scale pixel intensities by a random gain drawn from [1 - factor, 1 + factor]."""
    gain = 1 + random.uniform(-factor, factor)
    return cv2.convertScaleAbs(img, alpha=gain, beta=0)
def random_contrast(img, factor=0.3):
    """Randomly adjust contrast by a gain drawn from [1 - factor, 1 + factor].

    FIX: the original body was byte-identical to random_brightness
    (alpha-only scaling), so it only changed brightness. True contrast
    adjustment pivots around mid-gray (128) via beta = 128 * (1 - alpha),
    keeping the overall brightness roughly constant.
    """
    contrast = 1 + random.uniform(-factor, factor)
    return cv2.convertScaleAbs(img, alpha=contrast, beta=128 * (1 - contrast))
def random_noise(img, noise_level=0.03):
    """Add zero-mean Gaussian noise (std = noise_level * 255), clipped back to uint8."""
    perturbation = np.random.randn(*img.shape) * (noise_level * 255)
    noisy = img.astype(np.float32) + perturbation
    return np.clip(noisy, 0, 255).astype(np.uint8)
def random_crop(img, scale_range=(0.8, 1.0)):
    """Crop a random sub-region (randomly scaled size) and resize it back to the original size."""
    rows, cols = img.shape[:2]
    factor = random.uniform(*scale_range)
    crop_h, crop_w = int(rows * factor), int(cols * factor)
    y0 = random.randint(0, rows - crop_h)
    x0 = random.randint(0, cols - crop_w)
    patch = img[y0:y0 + crop_h, x0:x0 + crop_w]
    return cv2.resize(patch, (cols, rows))
def augment_image(img):
    """Apply the default augmentation pipeline: flip, rotate, brightness, contrast."""
    # Optional extras not enabled here: random_noise, random_crop
    for transform in (random_flip, random_rotation, random_brightness, random_contrast):
        img = transform(img)
    return img
# Example: generate an augmented dataset on disk.
# NOTE(review): assumes original/image_0.jpg .. image_99.jpg exist;
# cv2.imread returns None for a missing file, which would make the
# augmentation call fail — verify the paths before running.
for i in range(100):
    img = cv2.imread(f'original/image_{i}.jpg')
    for j in range(5):  # produce 5 augmented variants per source image
        augmented = augment_image(img)
        cv2.imwrite(f'augmented/image_{i}_aug_{j}.jpg', augmented)
在生产环境中,推理速度非常重要。以下是一些常用的优化技巧:
import cv2
import numpy as np  # FIX: needed by batch_inference (np.concatenate) below

# 1. GPU acceleration
net = cv2.dnn.readNetFromONNX('model.onnx')

# Inspect which backends / target devices this OpenCV build supports
print("可用的后端:", cv2.dnn.getAvailableBackends())
print("可用的目标设备:", cv2.dnn.getAvailableTargets())

# Route inference through CUDA
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

# 2. Intel OpenVINO (if installed)
# net.setPreferableBackend(cv2.dnn.DNN_BACKEND_INFERENCE_ENGINE)
# net.setPreferableTarget(cv2.dnn.DNN_TARGET_MYRIAD)

# 3. TensorRT (NVIDIA GPUs)
# requires converting the model to TensorRT format first

# 4. CPU thread count for OpenCV's parallel regions.
# FIX: setNumThreads is a module-level function (cv2.setNumThreads), not a
# method of cv2.dnn.Net — the original raised AttributeError.
cv2.setNumThreads(4)

# 5. Batched inference
def batch_inference(net, images, input_size=(416, 416)):
    """Run a single forward pass over a list of images stacked into one batch."""
    blobs = [cv2.dnn.blobFromImage(img, 1/255.0, input_size, swapRB=True)
             for img in images]
    # Stack the per-image blobs along the batch dimension
    batch = np.concatenate(blobs, axis=0)
    net.setInput(batch)
    return net.forward()
模型量化将浮点权重转换为低精度整数,可以大幅减少模型体积和加速推理。
# Model quantization with OpenCV (requires cooperation at training time)
# 1. Post-Training Quantization
# 2. Quantization-Aware Training
# Example: FP16 (half-precision) inference with OpenCV's DNN module
net = cv2.dnn.readNetFromONNX('model.onnx')
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA_FP16)  # half-precision GPU inference
虽然OpenCV的DNN模块可以运行很多模型,但在某些场景下,我们可能需要使用完整的PyTorch或TensorFlow环境。
# Scenario 1: run inference in PyTorch, then visualize the result with OpenCV
import cv2
import numpy as np  # FIX: np.uint8 is used below but numpy was never imported
import torch

# Inference in PyTorch
model = torch.load('model.pth')
model.eval()

img = cv2.imread('image.jpg')
# HWC uint8 BGR -> NCHW float tensor in [0, 1]
img_tensor = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).float() / 255.0
with torch.no_grad():
    output = model(img_tensor)

# Visualize the result with OpenCV
result = output.squeeze().numpy()
result = (result * 255).astype(np.uint8)
cv2.imshow('Result', result)
# Scenario 2: OpenCV as the pre-/post-processing layer around a PyTorch model
def hybrid_inference(image_path):
    """Full pipeline: OpenCV preprocessing -> PyTorch inference -> OpenCV postprocessing.

    NOTE(review): relies on module-level `model` and `process_output`
    being defined elsewhere.
    """
    # 1. OpenCV preprocessing: load, resize to 224x224, BGR -> RGB
    frame = cv2.imread(image_path)
    frame = cv2.resize(frame, (224, 224))
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # HWC -> NCHW float tensor scaled to [0, 1]
    tensor = torch.from_numpy(frame).permute(2, 0, 1).unsqueeze(0).float() / 255.0
    # 2. PyTorch inference (no gradients needed)
    with torch.no_grad():
        prediction = model(tensor)
    # 3. OpenCV postprocessing / visualization
    return process_output(prediction)
ONNX(Open Neural Network Exchange)是一个开放的中间格式,可以将不同框架的模型转换为统一格式。
# Convert PyTorch -> ONNX
import torch

# NOTE(review): MyModel must be defined/imported elsewhere
model = MyModel()
model.eval()

# Dummy input fixing the export shape: (batch, channels, height, width)
dummy_input = torch.randn(1, 3, 224, 224)

# Export to ONNX; dynamic_axes keeps the batch dimension variable
torch.onnx.export(model,
                  dummy_input,
                  "model.onnx",
                  export_params=True,      # store trained weights inside the file
                  opset_version=11,
                  do_constant_folding=True,
                  input_names=['input'],
                  output_names=['output'],
                  dynamic_axes={'input': {0: 'batch_size'},
                                'output': {0: 'batch_size'}})

# TensorFlow -> ONNX (command line):
# tf2onnx.convert --model keras_model.h5 --output model.onnx

# Load the ONNX model in OpenCV
net = cv2.dnn.readNetFromONNX('model.onnx')
import cv2
import numpy as np

# Initialize the individual models for the surveillance pipeline
# 1. Object detection (people / vehicles) — YOLOv4
person_net = cv2.dnn.readNetFromDarknet('yolov4.cfg', 'yolov4.weights')
person_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
person_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
# 2. Pose estimation — OpenPose
pose_net = cv2.dnn.readNetFromCaffe('openpose_body25.pbtxt', 'openpose_body25.caffemodel')
# 3. Face detection — ResNet-10 SSD
face_net = cv2.dnn.readNetFromCaffe('deploy.prototxt', 'res10_300x300_ssd_iter_140000.caffemodel')
def process_frame(frame):
    """Detect objects (YOLO) and faces (SSD) in one frame and draw the boxes.

    Returns the annotated frame.
    """
    h, w = frame.shape[:2]
    results = {}

    # --- Object detection (YOLO) ---
    blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True)
    person_net.setInput(blob)
    # FIX: a Darknet YOLO net has multiple output layers; forward() with no
    # arguments returns only the last one, silently dropping detections.
    outputs = person_net.forward(person_net.getUnconnectedOutLayersNames())

    persons = []
    for output in outputs:
        for det in output:
            if det[4] > 0.5:  # objectness threshold
                class_id = np.argmax(det[5:])
                # COCO ids: 0 = person, 1 = bicycle, 2 = car
                if class_id in (0, 1, 2):
                    # FIX: YOLO outputs normalized (center_x, center_y, width,
                    # height), not corner coordinates as the original assumed.
                    cx, cy = det[0] * w, det[1] * h
                    bw, bh = det[2] * w, det[3] * h
                    x, y = int(cx - bw / 2), int(cy - bh / 2)
                    persons.append((x, y, int(bw), int(bh)))
    results['objects'] = persons

    # --- Face detection (SSD) ---
    face_blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300), (104.0, 177.0, 123.0))
    face_net.setInput(face_blob)
    face_detections = face_net.forward()

    faces = []
    for i in range(face_detections.shape[2]):
        confidence = face_detections[0, 0, i, 2]
        if confidence > 0.5:
            x1 = int(face_detections[0, 0, i, 3] * w)
            y1 = int(face_detections[0, 0, i, 4] * h)
            x2 = int(face_detections[0, 0, i, 5] * w)
            y2 = int(face_detections[0, 0, i, 6] * h)
            faces.append((x1, y1, x2 - x1, y2 - y1))
    results['faces'] = faces

    # --- Draw the results ---
    for (x, y, bw, bh) in persons:
        cv2.rectangle(frame, (x, y), (x + bw, y + bh), (0, 255, 0), 2)
    for (x, y, bw, bh) in faces:
        cv2.rectangle(frame, (x, y), (x + bw, y + bh), (255, 0, 0), 2)
    return frame
# Process the video stream frame by frame
cap = cv2.VideoCapture('video.mp4')
while cap.isOpened():
    grabbed, frame = cap.read()
    if not grabbed:
        break
    annotated = process_frame(frame)
    cv2.imshow('Smart Surveillance', annotated)
    # Quit when the user presses 'q'
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()
| 模型 | 来源 | 用途 |
|---|---|---|
| YOLO | ultralytics/yolov5 | 目标检测 |
| SSD | OpenCV Model Zoo | 目标检测 |
| DeepLab | tensorflow models | 语义分割 |
| OpenPose | CMU Perceptual Computing Lab | 姿态估计 |
| FaceNet | davidsandberg/facenet | 人脸识别 |
| SqueezeNet/MobileNet | OpenCV Model Zoo | 图像分类 |
OpenCV官方提供了大量预训练模型,可以从以下地址获取:
# OpenCV DNN 示例模型
# https://github.com/opencv/opencv/tree/master/samples/dnn
# 下载相应的 .prototxt 和 .caffemodel / .pb 文件