首先可以先看下小强机器人上个版本:
自己制作智能语音机器人(基于jetson nano)自己制作智能语音机器人(基于jetson nano)_智能语音机器人开发_AI强仔的博客-CSDN博客
这个在上面版本基础上增加了图像显示系统、图像识别、手势识别、图像生成。所以硬件增加了显示屏、摄像头。
B站视频:【给小强机器人增加了眼睛(深度相机),能看能听能说能画了-哔哩哔哩】 给小强机器人增加了眼睛(深度相机),能看能听能说能画了_哔哩哔哩_bilibili
#!/usr/bin/env python3
# -*-coding: utf-8 -*-
"""
"""
import time
import cv2 as cv
import numpy as np
class CameraApp:
def __init__(self):
######################### Detection ##########################
# load the COCO class names
with open('camera/object_detection_coco.txt', 'r',encoding="utf-8") as f:
self.class_names = f.read().split('\n')
with open('camera/coco.names', 'rt') as f:
self.names = f.read().rstrip('\n').split('\n')
# get a different color array for each of the classes,np.random.uniform()作用于从一个均匀分布的区域中随机采样,返回对象:ndarray类型,形状和size中的数值一样
self.COLORS = np.random.uniform(0, 255, size=(len(self.class_names), 3))
# load the DNN modelimage
#self.model = cv.dnn.readNet(model='camera/frozen_inference_graph.pb', config='camera/ssd_mobilenet_v2_coco.txt', framework='TensorFlow')
######################### openpose ##########################
self.BODY_PARTS = {"Nose": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
"LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
"RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "REye": 14,
"LEye": 15, "REar": 16, "LEar": 17, "Background": 18}
self.POSE_PAIRS = [["Neck", "RShoulder"], ["Neck", "LShoulder"], ["RShoulder", "RElbow"],
["RElbow", "RWrist"], ["LShoulder", "LElbow"], ["LElbow", "LWrist"],
["Neck", "RHip"], ["RHip", "RKnee"], ["RKnee", "RAnkle"], ["Neck", "LHip"],
["LHip", "LKnee"], ["LKnee", "LAnkle"], ["Neck", "Nose"], ["Nose", "REye"],
["REye", "REar"], ["Nose", "LEye"], ["LEye", "LEar"]]
self.HAND_POSE_PAIRS = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], [10, 11],
[11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
#self.net = cv.dnn.readNetFromTensorflow("camera/graph_opt.pb")
self.hand_net=cv.dnn.readNetFromCaffe('camera/pose_deploy.prototxt','camera/pose_iter_102000.caffemodel')
self.yolo4_net= cv.dnn_DetectionModel('camera/yolov4-tiny.cfg', 'camera/yolov4-tiny.weights')
self.yolo4_net.setInputSize(320, 320)
self.yolo4_net.setInputScale(1.0 / 255)
self.yolo4_net.setInputSwapRB(True)
def YOLO_Target_Detection(self,frame):
# if image is None:
# time.sleep(0.5)
# return None
# image_height, image_width, _ = image.shape
# # blobFromImage主要是用来对图片进行预处理,1.整体像素值减去平均值(mean),2.通过缩放系数(scalefactor)对图片像素值进行缩放
# blob = cv.dnn.blobFromImage(image=image, size=(320, 320), mean=(0, 0, 0), swapRB=True)
# self.yolo4_net.setInput(blob)
classes, confidences, boxes = self.yolo4_net.detect(frame, confThreshold=0.1, nmsThreshold=0.4)
#output = self.model.forward()
# loop over each of the detections
if len(classes)>0:
for classId, confidence, box in zip(classes.flatten(), confidences.flatten(), boxes):
if confidence > 0.5:
left, top, w, h = box
cv.rectangle(frame, (left, top), (left + w, top + h), (0, 255, 0), 2)
cv.putText(frame, self.names[classId], (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
return frame
def Target_Detection(self,image):
# if image is None:
# time.sleep(0.5)
# return None
image_height, image_width, _ = image.shape
# blobFromImage主要是用来对图片进行预处理,1.整体像素值减去平均值(mean),2.通过缩放系数(scalefactor)对图片像素值进行缩放
blob = cv.dnn.blobFromImage(image=image, size=(300, 300), mean=(104, 117, 123), swapRB=True)
self.model.setInput(blob)
output = self.model.forward()
# loop over each of the detections
for detection in output[0, 0, :, :]:
# extract the confidence of the detection
confidence = detection[2]
# draw bounding boxes only if the detection confidence is above...
# ... a certain threshold, else skip
if confidence > .4:
# get the class id
class_id = detection[1]
# map the class id to the class
class_name = self.class_names[int(class_id) - 1]
color = self.COLORS[int(class_id)]
# get the bounding box coordinates
box_x = detection[3] * image_width
box_y = detection[4] * image_height
# get the bounding box width and height
box_width = detection[5] * image_width
box_height = detection[6] * image_height
# draw a rectangle around each detected object
cv.rectangle(image, (int(box_x), int(box_y)), (int(box_width), int(box_height)), color, thickness=2)
# put the class name text on the detected object
cv.putText(image, class_name, (int(box_x), int(box_y - 5)), cv.FONT_HERSHEY_SIMPLEX, 1, color, 2)
return image
def openpose_hand(self, frame):
nPoints = 22
frameHeight, frameWidth = frame.shape[:2]
aspect_ratio = frameWidth / frameHeight
# self.hand_net.setInput(
# cv.dnn.blobFromImage(frame, 1.0, (368, 368), (127.5, 127.5, 127.5), swapRB=True, crop=False))
#
inWidth = int(((aspect_ratio * 368) * 8) // 8)
inpBlob = cv.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, 368), (0, 0, 0), swapRB=False,crop=False)
self.hand_net.setInput(inpBlob)
out = self.hand_net.forward()
# out = out[:, :19, :, :] # MobileNet output [1, 57, -1, -1], we only need the first 19 elements
points = []
for i in range(nPoints):
# confidence map of corresponding body's part.
probMap = out[0, i, :, :]
probMap = cv.resize(probMap, (frameWidth, frameHeight))
# Find global maxima of the probMap.
minVal, prob, minLoc, point = cv.minMaxLoc(probMap)
if prob > 0.1:
cv.circle(frame, (int(point[0]), int(point[1])), 8, (0, 255, 255), thickness=-1,
lineType=cv.FILLED)
cv.putText(frame, "{}".format(i), (int(point[0]), int(point[1])), cv.FONT_HERSHEY_SIMPLEX, 1,
(0, 0, 255), 2, lineType=cv.LINE_AA)
# Add the point to the list if the probability is greater than the threshold
points.append((int(point[0]), int(point[1])))
else:
points.append(None)
# cv.imshow('Output-Keypoints', frame)
for pair in self.HAND_POSE_PAIRS:
partA = pair[0]
partB = pair[1]
if points[partA] and points[partB]:
cv.line(frame, points[partA], points[partB], (0, 255, 255), 2)
cv.circle(frame, points[partA], 8, (0, 0, 255), thickness=-1, lineType=cv.FILLED)
# cv2.imshow('Output-Skeleton', frame)
return frame
def openpose(self,frame):
frameHeight, frameWidth = frame.shape[:2]
self.net.setInput(cv.dnn.blobFromImage(frame, 1.0, (368, 368), (127.5, 127.5, 127.5), swapRB=True, crop=False))
out = self.net.forward()
out = out[:, :19, :, :] # MobileNet output [1, 57, -1, -1], we only need the first 19 elements
assert (len(self.BODY_PARTS) == out.shape[1])
points = []
for i in range(len(self.BODY_PARTS)):
# Slice heatmap of corresponging body's part.
heatMap = out[0, i, :, :]
# Originally, we try to find all the local maximums. To simplify a sample
# we just find a global one. However only a single pose at the same time
# could be detected this way.函数cv::minMaxLoc找出最大和最小元素的值和他们的位置.minVal, maxVal, minLoc, maxLoc=cv.minMaxLoc(src[, mask])
_, conf, _, point = cv.minMaxLoc(heatMap)
x = (frameWidth * point[0]) / out.shape[3]
y = (frameHeight * point[1]) / out.shape[2]
# Add a point if it's confidence is higher than threshold.
points.append((int(x), int(y)) if conf > 0.2 else None)
for pair in self.POSE_PAIRS:
partFrom = pair[0]
partTo = pair[1]
assert (partFrom in self.BODY_PARTS)
assert (partTo in self.BODY_PARTS)
idFrom = self.BODY_PARTS[partFrom]
idTo = self.BODY_PARTS[partTo]
if points[idFrom] and points[idTo]:
cv.line(frame, points[idFrom], points[idTo], (0, 255, 0), 3)
cv.ellipse(frame, points[idFrom], (3, 3), 0, 0, 360, (0, 0, 255), cv.FILLED)#ellipse()函数可以画扇形,也可以画椭圆
cv.ellipse(frame, points[idTo], (3, 3), 0, 0, 360, (0, 0, 255), cv.FILLED)
return frame
if __name__ == '__main__':
capture = cv.VideoCapture(0)
cv_edition = cv.__version__
#CAP_PROP_FOURCC:4个字符表示的视频编码器格式
#Xvid是一个开放源代码的MPEG-4视频编解码器
if cv_edition[0] == '3': capture.set(cv.CAP_PROP_FOURCC, cv.VideoWriter_fourcc(*'XVID'))
else: capture.set(cv.CAP_PROP_FOURCC, cv.VideoWriter.fourcc('M', 'J', 'P', 'G'))
capture.set(cv.CAP_PROP_FRAME_WIDTH, 640)
capture.set(cv.CAP_PROP_FRAME_HEIGHT, 480)
c_app=CameraApp()
state=True
while capture.isOpened():
start = time.time()
ret, frame = capture.read()
action = cv.waitKey(10) & 0xFF#waitKey其中参数k单位是毫秒.返回值为k毫秒内键盘按键的ASCII码值
if state==True: frame = c_app.Target_Detection(frame)
else: frame = c_app.openpose(frame)
end = time.time()
fps = 1 / (end - start)
text = "FPS : " + str(int(fps))
#FONT_HERSHEY_SIMPLEX:正常大小无衬线字体
cv.putText(frame, text, (20, 30), cv.FONT_HERSHEY_SIMPLEX, 0.9, (100, 200, 200), 1)
cv.imshow('frame', frame)
# 将OpenCV帧转换为PIL图像
#image = Image.fromarray(cv.cvtColor(frame, cv.COLOR_BGR2RGB))
# 将PIL图像转换为Tkinter图像
#photo = ImageTk.PhotoImage(image=image)
if action == ord('q') or action == 113: break
if action == ord('f'):state = not state
capture.release()
cv.destroyAllWindows()