给语音打上标签

# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 20:14:38 2018

@author: yanghe
"""

import numpy as np
import tensorflow as tf
import pygame  
from  voice_tool import *

# Bring up the pygame mixer and start playing the dev clip.
pygame.mixer.init()
track4 = pygame.mixer.Sound("./raw_data/dev/2.wav")
track4.play()


# Load the training and dev arrays from disk. The label arrays are reshaped
# so that each row holds 1375 values, matching the `targets` placeholder.
X = np.load("./XY_train/X.npy")
Y = np.load("./XY_train/Y.npy").reshape((-1, 1375))
X_dev = np.load("./XY_dev/X_dev.npy")
Y_dev = np.load("./XY_dev/Y_dev.npy").reshape((-1, 1375))
def get_weight_l2(shape, name, regulariztion_rate=None):
    """Create a weight/bias variable pair, optionally L2-regularised.

    Args:
        shape: Shape of the weight variable. Its last entry is also used as
            the length of the bias vector.
        name: Suffix appended to ``weight_`` / ``bias_`` to form the variable
            names (converted with ``str``).
        regulariztion_rate: When not None, the scale passed to
            ``tf.contrib.layers.l2_regularizer``; the resulting penalty on the
            weight is added to the ``'loss'`` graph collection.

    Returns:
        A ``(bias, weight)`` tuple -- note that the bias comes first.
    """
    weight = tf.get_variable(
        'weight_%s' % str(name),
        shape=shape,
        initializer=tf.truncated_normal_initializer(stddev=0.1),
    )
    bias = tf.get_variable(
        'bias_%s' % str(name),
        shape=[shape[-1]],
        initializer=tf.constant_initializer(0.1),
    )
    # Identity comparison: `!= None` would dispatch to the argument's own
    # comparison operator instead of simply testing for "no rate given".
    if regulariztion_rate is not None:
        penalty = tf.contrib.layers.l2_regularizer(regulariztion_rate)(weight)
        tf.add_to_collection('loss', penalty)
    return bias, weight

def model(inputs):
    """Build the conv -> 2-layer LSTM -> per-step sigmoid network.

    Variables are created with ``tf.get_variable`` under the caller's current
    variable scope. Both dropout stages read the module-level ``keep_prob``
    placeholder.

    Args:
        inputs: Float tensor that can be reshaped to (batch, 5511, 1, 101).

    Returns:
        Tensor of shape (batch, 1375) holding one sigmoid activation per
        output step.
    """
    inputs = tf.reshape(inputs, (-1, 5511, 1, 101))
    with tf.variable_scope('conv1'):
        conv1_biases, conv1_weight = get_weight_l2(shape=[15, 1, 101, 128], name='layer1')
        conv1 = tf.nn.conv2d(inputs, conv1_weight, strides=[1, 1, 1, 1], padding='SAME')
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))
        # Window 15, stride 4, no padding: (5511 - 15) // 4 + 1 = 1375 steps.
        pool1 = tf.nn.max_pool(relu1, ksize=[1, 15, 1, 1], strides=[1, 4, 1, 1], padding='VALID')
        dorp1 = tf.nn.dropout(pool1, keep_prob)
        # Keep the tensor batch-major (batch, steps, features). dynamic_rnn is
        # called below with its default time_major=False, so transposing to
        # time-major here made the LSTM recur over the batch axis, and the
        # later reshape then mixed examples together.
        dorp1 = tf.reshape(dorp1, (-1, 1375, 128))
    with tf.variable_scope('rnn'):
        def _make_cell():
            # One fresh cell object per layer.
            lstm = tf.nn.rnn_cell.BasicLSTMCell(128, forget_bias=1.0)
            return tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=keep_prob)

        # `[cell] * 2` put the SAME cell object in both layers; depending on
        # the TensorFlow 1.x release that either shares one set of weights
        # across the layers or raises. NOTE(review): checkpoints written by a
        # graph that shared the cell will not restore into this one.
        cell = tf.nn.rnn_cell.MultiRNNCell([_make_cell() for _ in range(2)])
        outputs, _ = tf.nn.dynamic_rnn(cell, dorp1, dtype=tf.float32)
        outputs = tf.reshape(outputs, (-1, 1375, 128, 1))
    with tf.variable_scope('conv2'):
        conv2_biases, conv2_weight = get_weight_l2(shape=[1, 128, 1, 1], name='layer2')
        # A 1x128 kernel with VALID padding collapses the 128-wide feature
        # axis to width 1, i.e. one weighted sum per step. With SAME padding
        # the width stayed 128, so the reshape below produced 128x too many
        # rows of interleaved values.
        conv2 = tf.nn.conv2d(outputs, conv2_weight, strides=[1, 1, 1, 1], padding='VALID')
        sig1 = tf.nn.sigmoid(tf.nn.bias_add(conv2, conv2_biases))
        sig1 = tf.reshape(sig1, (-1, 1375))
    return sig1

def train():
    """Build the training graph, run full-batch training and save a checkpoint.

    Reads module-level state: the ``input_data``, ``targets`` and ``keep_prob``
    placeholders, the ``X``/``Y`` and ``X_dev``/``Y_dev`` arrays, and the
    ``training_steps``, ``learning_rate_base``, ``learing_rate_decay`` and
    ``moving_average_decay`` settings.

    Every step feeds the whole of ``X``/``Y`` with keep_prob 0.8. On every
    second step the loss on the dev arrays is printed (keep_prob 1.0). After
    the last step the session is saved to ``saver/moedl_voce_3.ckpt``.
    """
    global_step = tf.Variable(0, trainable=False)

    # Build the network first so that its variables exist before the moving
    # average below is asked to track them.
    with tf.variable_scope('voice'):
        pred = model(input_data)

    with tf.variable_scope("moving_average"):
        variable_averages = tf.train.ExponentialMovingAverage(moving_average_decay, global_step)
        # Previously this ran before model() was called, when the list of
        # trainable variables was still empty, so nothing was being averaged.
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.variable_scope("train_step"):
        # decay_steps=1: the rate is multiplied by the decay factor each step.
        learning_rate = tf.train.exponential_decay(
                        learning_rate_base,
                        global_step,
                        1,
                        learing_rate_decay,
                        staircase=True)

    # Binary cross-entropy over independent sigmoid outputs. The earlier
    # -sum(t * log(p)) form had no (1 - t) * log(1 - p) term, so it was
    # minimised by predicting 1 everywhere; clipping also avoids log(0).
    epsilon = 1e-7
    clipped = tf.clip_by_value(pred, epsilon, 1.0 - epsilon)
    log_likelihood = targets * tf.log(clipped) + (1.0 - targets) * tf.log(1.0 - clipped)
    # Sum over the 1375 steps of each example, then average over the batch.
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(log_likelihood, axis=1))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy, global_step=global_step)

    # One op that triggers both the optimiser update and the averaging update.
    with tf.control_dependencies([train_step, variables_averages_op]):
        train_op = tf.no_op(name='train')

    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(training_steps):
            sess.run(train_op, feed_dict={input_data: X, targets: Y, keep_prob: 0.8})
            if i % 2 == 0:
                loss = sess.run(cross_entropy, feed_dict={input_data: X_dev, targets: Y_dev, keep_prob: 1.0})
                print("After %d training step(s), the model loss is %g " % (i, loss))
        saver.save(sess, 'saver/moedl_voce_3.ckpt')
            
def predict(filename="./raw_data/dev/2.wav",
            checkpoint_path='saver/moedl_voce_1.ckpt',
            threshold=0.5):
    """Run the saved model on one audio clip and play the chimed result.

    The model variables must already exist in the default graph under the
    ``'voice'`` scope, because the scope is re-entered here in reuse mode.

    Args:
        filename: Path of the clip passed to ``graph_spectrogram`` and
            ``chime_on_activate`` (both come from ``voice_tool``).
        checkpoint_path: Checkpoint restored into the session before
            inference.
        threshold: Third argument forwarded to ``chime_on_activate``.
            NOTE(review): presumably the activation level above which a chime
            is inserted -- confirm against ``voice_tool``.

    The defaults reproduce the values that used to be hard-coded here.
    """
    x = graph_spectrogram(filename)
    # Swap the two spectrogram axes and add a leading batch axis of size 1 so
    # the array can be fed to `input_data`.
    x = x.swapaxes(0, 1)
    x = np.expand_dims(x, axis=0)
    with tf.variable_scope('voice') as scope:
        scope.reuse_variables()
        prediction = model(input_data)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver.restore(sess, checkpoint_path)
        # keep_prob 1.0 disables dropout for inference.
        predict_ = sess.run(prediction, feed_dict={input_data: x, keep_prob: 1.0})
        print(predict_)
        chime_on_activate(filename, predict_, threshold)
        # NOTE(review): assumes chime_on_activate wrote ./chime_output.wav -- confirm.
        track4 = pygame.mixer.Sound("./chime_output.wav")
        track4.play()
        
# Graph inputs read as module-level names by model(), train() and predict().
keep_prob = tf.placeholder(dtype=tf.float32)
input_data = tf.placeholder(dtype=tf.float32, shape=[None, 5511, 101])
targets = tf.placeholder(dtype=tf.float32, shape=[None, 1375])

# Settings read by train().
training_steps = 50
learning_rate_base = 0.01
learing_rate_decay = 0.99
moving_average_decay = 0.99

# Training runs as soon as the script is executed.
train()
#predict()


# Inference on a single clip, restoring the checkpoint path that train()
# saves to.
filename = "./raw_data/dev/2.wav"
# Swap the two spectrogram axes, then add a leading batch axis of size 1.
x = np.expand_dims(graph_spectrogram(filename).swapaxes(0, 1), axis=0)
# Re-enter the 'voice' scope in reuse mode so model() picks up the existing
# variables instead of trying to create them again.
with tf.variable_scope('voice', reuse=True) as scope:
    prediction = model(input_data)
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, 'saver/moedl_voce_3.ckpt')
    # keep_prob 1.0 disables dropout for inference.
    predict_ = sess.run(prediction, feed_dict={input_data: x, keep_prob: 1.0})
    chime_on_activate(filename, predict_, 0.9995656)
    track4 = pygame.mixer.Sound("./chime_output.wav")
    track4.play()

你可能感兴趣的:(给语音打上标签)