python3.7
pytorch1.1.0
torchvision 0.3.0
cuda 9.0以上
## 项目框架
Audio-and-video-demo
将手势视频按帧分解为图片并保存
def ffmpeg_img_extract(videopath):
    """Save the key frames of a video as JPEG files in ``images/ffmpeg_img/``.

    The decoder is told to skip every non-key frame; each decoded frame is
    written as ``<frame.index>.jpg`` with JPEG quality 80.

    Args:
        videopath: Path of the video file, passed unchanged to ``av.open``.
    """
    # The context manager closes the container (and the underlying file) even
    # if decoding or saving raises; previously it was never closed.
    with av.open(videopath) as container:
        stream = container.streams.video[0]
        # Ask the codec to drop everything that is not a key frame.
        stream.codec_context.skip_frame = 'NONKEY'
        for frame in container.decode(stream):
            savepath = 'images/ffmpeg_img/' + '%d.jpg' % frame.index
            frame.to_image().save(savepath, quality=80)
def img_to_video(videopath):
    """Decode every frame of a video into ``images/ffmpeg_img/<index>.jpg``.

    Despite its name, this function goes from video to images: each frame of
    the first video stream is converted to an image and saved under the
    frame's ``index``.

    Args:
        videopath: Path of the video file, passed unchanged to ``av.open``.
    """
    # The context manager closes the container (and the underlying file) even
    # if decoding or saving raises; previously it was never closed.
    with av.open(videopath) as container:
        for frame in container.decode(video=0):
            savepath = 'images/ffmpeg_img/' + '%d.jpg' % frame.index
            frame.to_image().save(savepath)
利用训练好的模型对手势图像进行识别,并用 bgm_label 列表记录每一帧的标签。这里使用 GoogLeNet 预训练模型在我们的数据集上进行训练,采用学习率降低法多次迭代训练,得到的模型对手势图像的识别正确率在 95% 以上。
def gesture_recognition(filepath):
    """Classify every extracted frame and return one predicted label per frame.

    Frames are read as ``<filepath>0.jpg``, ``<filepath>1.jpg``, ...; the
    number of frames is taken to be the number of entries in ``filepath``.
    Each frame is also passed to ``putText`` together with its predicted
    label.

    Args:
        filepath: Directory holding the numbered frames. It is concatenated
            directly with the file name, so it must end with ``/``.

    Returns:
        list of str: The highest-probability label (whitespace-stripped) for
        each frame, in frame order.
    """
    # Every directory entry is assumed to be one of the numbered frames.
    count = len(os.listdir(filepath))
    if count == 0:
        return []

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Everything below up to the loop is frame-independent, so it is built
    # once instead of once per frame.
    # NOTE(review): RandomRotation is a random augmentation, so the prediction
    # for a given frame can differ between runs. Kept as in the original --
    # confirm whether it is really wanted at inference time.
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # One label per line; the line number is the class index.
    with open('images/gesture_24.txt', 'r', encoding='gbk') as clf:
        labels = clf.readlines()

    # map_location + .to(device) keep the model on the same device as the
    # input tensor; without them a CPU/GPU mismatch raises at load or call
    # time. NOTE: torch.load unpickles the file -- only load trusted models.
    model = torch.load('model/googlenet_model.pkl', map_location=device)
    model.to(device)
    model.eval()

    bgm_label = []
    for i in range(count):
        filename = filepath + str(i) + '.jpg'
        # Close the image file as soon as it has been turned into a tensor.
        with Image.open(filename) as input_image:
            input_tensor = preprocess(input_image)
        image_tensor = input_tensor.unsqueeze(0).to(device)

        start = time()
        with torch.no_grad():
            output = model(image_tensor)
        prob = F.softmax(output[0], dim=0)
        best = int(torch.argmax(prob))
        finish = time()
        print("识别时间:")
        print(finish - start)

        # Record the label for the audio track and stamp it on the frame.
        bgm_label.append(labels[best].strip())
        putText(filename, labels[best])
    return bgm_label
给视频按照手势识别标签制作并添加音频。针对手势变换时音频快速变换的问题,采用一个标记列表 label_flag 记录所有标签中标签变换的位置信息,同时用 len_flag(代码中即相邻变换位置之差 if_or_not)存储每一个连续标签存在的时长。对于小于一定连续帧长度(代码中为 12 帧)的标签做容错处理,默认其识别错误,用相应时长的空白音频进行填充;对于大于 30 帧的连续帧标记,进行一次语音播报,剩下时长用等时长的空白音频填充;介于两者之间的标签,则播放截取到相应时长的语音。
# Gesture label -> number N of the clip ``bgm/N.wav`` used for that label.
# Labels are numbered 1..24 in the order listed here.
bgm_dict = {
    name: number
    for number, name in enumerate(
        (
            'Congratulation',
            'Eight',
            'Fist',
            'Five',
            'Four',
            'Heart_1',
            'Heart_2',
            'Heart_3',
            'Heart_single',
            'Honour',
            'ILY',
            'Insult',
            'Nine',
            'OK',
            'One',
            'Palm_up',
            'Prayer',
            'Rock',
            'Seven',
            'Six',
            'Three',
            'Thumb_down',
            'Thumb_up',
            'Two',
        ),
        start=1,
    )
}
def _label_runs(bgm_label):
    """Collapse per-frame labels into ordered ``(label, frame_count)`` runs."""
    from itertools import groupby

    return [(label, sum(1 for _ in frames)) for label, frames in groupby(bgm_label)]


def add_bgm3(bgm_label, frame_seconds=0.033, min_frames=12, announce_frames=30):
    """Build the audio track ``bgm/clip.wav`` from the per-frame labels.

    Consecutive identical labels form a run. For each run:

    * shorter than ``min_frames``: treated as a misrecognition and filled
      with audio taken from ``bgm/0.wav``;
    * longer than ``announce_frames``: the label's clip ``bgm/<N>.wav``
      (``N = bgm_dict[label]``) is used for ``announce_frames`` frames and
      the remainder is filled from ``bgm/0.wav``;
    * otherwise: the label's clip is used, cut to the length of the run.

    Args:
        bgm_label: Sequence of labels, one per video frame. An empty sequence
            yields a track containing only the tiny seed segment.
        frame_seconds: Duration attributed to one frame. The default 0.033 is
            the value that used to be hard-coded.
            NOTE(review): this is slightly less than 1/30 s, so for 30 fps
            video the track comes out about 1% shorter than the video.
        min_frames: Runs shorter than this are replaced by ``bgm/0.wav``.
        announce_frames: Maximum number of frames of label audio per run.

    Raises:
        KeyError: If a label is not a key of ``bgm_dict``.
    """
    frame_ms = frame_seconds * 1000
    filler_path = 'bgm/0.wav'

    # Each wav file is decoded at most once, and only if it is needed.
    loaded = {}

    def load(path):
        if path not in loaded:
            loaded[path] = AudioSegment.from_wav(path)
        return loaded[path]

    # A 0.1 ms slice of an existing clip serves as the (practically empty)
    # segment that everything else is appended to.
    clip = load('bgm/1.wav')[:0.0001 * 1000]

    # NOTE(review): a slice cannot be longer than its source file, so this
    # presumably relies on bgm/0.wav and the label clips being at least as
    # long as the requested durations -- confirm.
    #
    # Run lengths are counted over all frames. The previous boundary list
    # ended at len(bgm_label) - 1, which cut the last run one frame short.
    for label, frames in _label_runs(bgm_label):
        # Looked up for every run, so an unknown label fails loudly even
        # when the run is too short to be announced.
        number = int(bgm_dict[label])
        label_path = "bgm/%d" % number + ".wav"

        if frames < min_frames:
            clip = clip + load(filler_path)[:frames * frame_ms]
        elif frames > announce_frames:
            clip = clip + load(label_path)[:announce_frames * frame_ms]
            clip = clip + load(filler_path)[:(frames - announce_frames) * frame_ms]
        else:
            clip = clip + load(label_path)[:frames * frame_ms]

    clip.export('bgm/clip.wav', format='wav')
将对应音频添加到合成好的视频上
def video_merge2(outpath):
    """Write ``video/saveVideo.mp4`` with ``bgm/clip.wav`` as its audio track.

    Args:
        outpath: Path of the video file to write.
    """
    bgm_path = "bgm/clip.wav"
    audio = AudioFileClip(bgm_path)
    try:
        video = VideoFileClip('video/saveVideo.mp4')
        try:
            # set_audio returns the clip that carries the new audio track.
            video.set_audio(audio).write_videofile(outpath)
        finally:
            # Close the clips explicitly; previously they were left open
            # after the output had been written.
            video.close()
    finally:
        audio.close()
### combination.py
将识别完并打上标签的手势图片合成为视频
def combination(length, fps=30):
    """Encode the labelled frames into ``video/saveVideo.mp4``.

    Frames are read from ``images/rec_image/0.jpg`` up to
    ``images/rec_image/<length - 1>.jpg``; the output size is taken from
    frame 0 and the video is written with the ``mp4v`` codec.

    Args:
        length: Number of consecutively numbered frames to encode.
        fps: Frame rate of the output video. Defaults to 30, the value that
            used to be hard-coded.

    Raises:
        OSError: If ``0.jpg`` or any frame below ``length`` cannot be read.
    """
    img_root = "images/rec_image/"
    file_path = 'video/saveVideo.mp4'

    first_path = img_root + "0.jpg"
    first = cv2.imread(first_path)
    # cv2.imread signals failure by returning None instead of raising.
    if first is None:
        raise OSError("cannot read frame image: " + first_path)

    # An image array is indexed (row, column, channel), i.e. height first,
    # while VideoWriter expects its size as (width, height).
    height, width = first.shape[:2]
    size = (width, height)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    videoWriter = cv2.VideoWriter(file_path, fourcc, fps, size)
    try:
        for i in range(length):
            frame_path = img_root + str(i) + '.jpg'
            frame = cv2.imread(frame_path)
            if frame is None:
                raise OSError("cannot read frame image: " + frame_path)
            videoWriter.write(frame)
    finally:
        # Release the writer even when a frame is missing.
        videoWriter.release()
对视频分解为帧的图片进行手势识别并贴上标签
def putText(image, label):
    """Draw ``label`` on an image and save the result in ``images/rec_image/``.

    The output keeps the source file name (the part of ``image`` after the
    last ``/``). Both the source path and the output path are printed.

    Args:
        image: Path of the source image.
        label: Text to draw; surrounding whitespace is stripped first.

    Raises:
        OSError: If the source image cannot be read or the result cannot be
            written.
    """
    print(image)
    imagename = image.rsplit("/", 1)[-1]
    savepath = 'images/rec_image/' + imagename
    print(savepath)
    label = label.strip()

    frame = cv2.imread(image)
    # cv2.imread signals failure by returning None instead of raising.
    if frame is None:
        raise OSError("cannot read image: " + image)

    # Text origin (100, 100), font scale 2, thickness 2. OpenCV colours are
    # in BGR order, so (255, 0, 0) is blue.
    frame = cv2.putText(frame, label, (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 2)

    # cv2.imwrite reports failure through its return value; surface it here
    # rather than leaving a frame missing for later steps to trip over.
    if not cv2.imwrite(savepath, frame):
        raise OSError("cannot write image: " + savepath)
### main.py
主函数
def main():
    """Run the whole pipeline on the video named on the command line.

    ``sys.argv[1]`` is the input file name and ``sys.argv[2]`` the output
    file name; both are resolved inside the ``video/`` directory.

    Steps: split the video into frames, recognise the gesture in each frame
    (which also writes the labelled frames), encode the labelled frames into
    a video, build the audio track from the labels, and merge audio and
    video into the output file.
    """
    # Fail with a usage message instead of an IndexError on missing arguments.
    if len(sys.argv) < 3:
        sys.exit("usage: python main.py <input video> <output video>")

    videopath = 'video/' + str(sys.argv[1])
    outpath = 'video/' + str(sys.argv[2])

    img_to_video(videopath)

    # Directory the extracted frames are read from.
    # NOTE(review): gesture_recognition counts every entry in this directory,
    # so frames left over from an earlier, longer video would be processed as
    # well -- consider clearing it before extraction.
    filepath = 'images/ffmpeg_img/'

    # Recognise the gesture in every frame; labelled copies are saved too.
    bgm_label = gesture_recognition(filepath)

    # Encode the labelled frames into a video.
    combination(len(bgm_label))

    # Build the audio track and attach it to the video.
    add_bgm3(bgm_label)
    video_merge2(outpath)
命令行使用(输入、输出文件均位于 video/ 目录下,参数只需给出文件名):
example:
python main.py test.mp4 out.mp4    # test.mp4 为输入,out.mp4 为输出
gitee 链接:https://gitee.com/ceasarxo/gesture-recognition