"""Deep Knowledge Tracing (DKT) in TensorFlow 1.x compatibility mode: an LSTM
over a student's (skill, correctness) interaction history that predicts the
probability of answering the next skill correctly."""
import random
import time
import os
import datetime

import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()
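# --- Hyperparameter containers ---
# Note: TrainConfig.decay_rate is defined but never referenced below;
# presumably it is intended for a learning-rate decay schedule.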
class TrainConfig(object):
epochs = 10
decay_rate = 0.92
learning_rate = 0.01
evaluate_every = 100
checkpoint_every = 100
max_grad_norm = 3.0
class ModelConfig(object):
hidden_layers = [200]
dropout_keep_prob = 0.6
class Config(object):
batch_size = 32
num_skills = 124
input_size = num_skills * 2
trainConfig = TrainConfig()
modelConfig = ModelConfig()
config = Config()
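# DataGenerator reads a whitespace-separated log of (student, skill, correct)
# triples, maps raw skill ids to dense indices, splits students into train and
# test sets, and turns each student's history into padded model inputs.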
class DataGenerator(object):
def __init__(self, fileName, config):
self.fileName = fileName
self.train_seqs = []
self.test_seqs = []
self.infer_seqs = []
self.batch_size = config.batch_size
self.pos = 0
self.end = False
self.num_skills = config.num_skills
self.skills_to_int = {}
self.int_to_skills = {}
    def read_file(self):
        """Parse the interaction log into per-student sequences.

        Returns a dict mapping student id -> [[skill, is_correct], ...] and
        the list of distinct skill ids seen in the file.
        """
        seqs_by_student = {}
        skills = []
        with open(self.fileName, 'r') as f:
            for line in f:
                fields = line.strip().split()
                student, skill, is_correct = int(fields[0]), int(fields[1]), int(fields[2])
                skills.append(skill)
                seqs_by_student.setdefault(student, []).append([skill, is_correct])
        return seqs_by_student, list(set(skills))
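    # Expected line format (inferred from the parsing above):
    # "<student_id> <skill_id> <is_correct>", e.g. "17 42 1".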
    def gen_dict(self, unique_skills):
        """Build bidirectional mappings between raw skill ids and dense indices."""
        sorted_skills = sorted(unique_skills)
        self.skills_to_int = {skill: idx for idx, skill in enumerate(sorted_skills)}
        self.int_to_skills = {idx: skill for idx, skill in enumerate(sorted_skills)}
    def split_dataset(self, seqs_by_student, sample_rate=0.2, random_seed=1):
        """Hold out sample_rate of the students as the test set (seeded for reproducibility)."""
        sorted_keys = sorted(seqs_by_student.keys())
        random.seed(random_seed)
        test_keys = set(random.sample(sorted_keys, int(len(sorted_keys) * sample_rate)))
        test_seqs = [seqs_by_student[k] for k in sorted_keys if k in test_keys]
        train_seqs = [seqs_by_student[k] for k in sorted_keys if k not in test_keys]
        return train_seqs, test_seqs
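    # Note: the split is by student id, so all interactions of a given student
    # land entirely in either the train set or the test set.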
    def gen_attr(self, is_infer=False):
        """Read the data file and populate the sequence attributes and skill maps."""
        seqs_by_students, skills = self.read_file()
        if is_infer:
            self.infer_seqs = seqs_by_students
        else:
            self.train_seqs, self.test_seqs = self.split_dataset(seqs_by_students)
        self.gen_dict(skills)
    def pad_sequences(self, sequences, maxlen=None, value=0.):
        """Right-pad (and truncate if necessary) sequences to a common length."""
        lengths = [len(s) for s in sequences]
        nb_samples = len(sequences)
        if maxlen is None:
            maxlen = np.max(lengths)
        x = (np.ones((nb_samples, maxlen)) * value).astype(np.int32)
        for idx, s in enumerate(sequences):
            trunc = np.asarray(s, dtype=np.int32)[:maxlen]
            x[idx, :len(trunc)] = trunc
        return x
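    # Example: pad_sequences([[1, 2], [3]], value=-1) -> [[1, 2], [3, -1]].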
    def num_to_one_hot(self, num, dim):
        """One-hot encode num; negative indices (padding) yield an all-zero vector."""
        base = np.zeros(dim)
        if num >= 0:
            base[num] += 1
        return base
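    # format_data builds the standard DKT input/target alignment: the input at
    # step t encodes interaction t, and the targets ask for the correctness of
    # interaction t + 1. A wrong answer on skill s is encoded as index s, a
    # correct answer as s + num_skills; e.g. with num_skills = 124, a correct
    # answer on skill 5 one-hot-encodes index 5 + 124 = 129.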
    def format_data(self, seqs):
        """Turn a batch of raw sequences into padded model inputs and targets."""
        # Each sequence of n interactions yields n - 1 prediction steps.
        seq_len = np.array([len(seq) - 1 for seq in seqs])
        max_len = max(seq_len)
        # Keep these as plain lists: the rows are ragged until pad_sequences runs.
        x_sequences = [[self.skills_to_int[j[0]] + self.num_skills * j[1] for j in i[:-1]] for i in seqs]
        x = self.pad_sequences(x_sequences, maxlen=max_len, value=-1)
        input_x = np.array([[self.num_to_one_hot(j, self.num_skills * 2) for j in i] for i in x])
        target_id_seqs = [[self.skills_to_int[j[0]] for j in i[1:]] for i in seqs]
        target_id = self.pad_sequences(target_id_seqs, maxlen=max_len, value=0)
        target_correctness_seqs = [[j[1] for j in i[1:]] for i in seqs]
        target_correctness = self.pad_sequences(target_correctness_seqs, maxlen=max_len, value=0)
        return dict(input_x=input_x, target_id=target_id, target_correctness=target_correctness,
                    seq_len=seq_len, max_len=max_len)
    def next_batch(self, seqs):
        """Yield formatted batches of exactly batch_size sequences each."""
        num_batches = len(seqs) // self.batch_size
        for i in range(num_batches):
            batch_seqs = seqs[i * self.batch_size: (i + 1) * self.batch_size]
            yield self.format_data(batch_seqs)
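    # Note: sequences beyond the last full batch are dropped, because the
    # graph's placeholders are built with a fixed batch_size dimension.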
class TensorFlowDKT(object):
    """DKT model: a stacked LSTM over one-hot interaction vectors followed by
    a per-skill sigmoid output layer."""
    def __init__(self, config):
        self.hiddens = hiddens = config.modelConfig.hidden_layers
        self.num_skills = num_skills = config.num_skills
        self.input_size = input_size = config.input_size
        self.batch_size = batch_size = config.batch_size
        self.keep_prob_value = config.modelConfig.dropout_keep_prob
        # Placeholders; the time dimension is dynamic, so max_steps is fed per batch.
        self.max_steps = tf.placeholder(tf.int32, name="max_steps")
        self.input_data = tf.placeholder(tf.float32, [batch_size, None, input_size], name="input_x")
        self.sequence_len = tf.placeholder(tf.int32, [batch_size], name="sequence_len")
        self.keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        self.target_id = tf.placeholder(tf.int32, [batch_size, None], name="target_id")
        self.target_correctness = tf.placeholder(tf.float32, [batch_size, None], name="target_correctness")
        self.flat_target_correctness = None
        # Stack LSTM cells, each with dropout on its outputs.
        hidden_layers = []
        for idx, hidden_size in enumerate(hiddens):
            lstm_layer = tf.nn.rnn_cell.LSTMCell(num_units=hidden_size, state_is_tuple=True)
            hidden_layer = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_layer, output_keep_prob=self.keep_prob)
            hidden_layers.append(hidden_layer)
        self.hidden_cell = tf.nn.rnn_cell.MultiRNNCell(cells=hidden_layers, state_is_tuple=True)
        # Unroll over the dynamic time dimension; outputs is [batch, time, hidden].
        outputs, self.current_state = tf.nn.dynamic_rnn(cell=self.hidden_cell,
                                                        inputs=self.input_data,
                                                        sequence_length=self.sequence_len,
                                                        dtype=tf.float32)
        # Project every hidden state onto a logit per skill.
        output_w = tf.get_variable("W", [hiddens[-1], num_skills])
        output_b = tf.get_variable("b", [num_skills])
        self.output = tf.reshape(outputs, [batch_size * self.max_steps, hiddens[-1]])
        self.logits = tf.matmul(self.output, output_w) + output_b
        self.mat_logits = tf.reshape(self.logits, [batch_size, self.max_steps, num_skills])
        # Predicted correctness probability for every skill at every step.
        self.pred_all = tf.sigmoid(self.mat_logits, name="pred_all")
        flat_target_correctness = tf.reshape(self.target_correctness, [-1])
        self.flat_target_correctness = flat_target_correctness
        # Pick out, for each (sequence, step), the logit of the skill that is
        # actually attempted next: flatten the logits and gather by offset.
        flat_base_target_index = tf.range(batch_size * self.max_steps) * num_skills
        flat_base_target_id = tf.reshape(self.target_id, [-1])
        flat_target_id = flat_base_target_id + flat_base_target_index
        flat_logits = tf.reshape(self.logits, [-1])
        flat_target_logits = tf.gather(flat_logits, flat_target_id)
        self.pred = tf.sigmoid(tf.reshape(flat_target_logits, [batch_size, self.max_steps]), name="pred")
        self.binary_pred = tf.cast(tf.greater_equal(self.pred, 0.5), tf.float32, name="binary_pred")
        with tf.name_scope("loss"):
            self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=flat_target_correctness,
                                                                               logits=flat_target_logits))
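        # Note: padded steps (target_id and target_correctness filled with 0)
        # also contribute to this loss, as in the original code. A sketch of
        # how they could be masked out instead, using the existing placeholders:
        #   mask = tf.reshape(tf.sequence_mask(self.sequence_len, self.max_steps), [-1])
        #   losses = tf.nn.sigmoid_cross_entropy_with_logits(
        #       labels=flat_target_correctness, logits=flat_target_logits)
        #   self.loss = tf.reduce_mean(tf.boolean_mask(losses, mask))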
def mean(values):
    """Arithmetic mean of a non-empty sequence."""
    return sum(values) / len(values)
def gen_metrics(sequence_len, binary_pred, pred, target_correctness):
    """Strip the padding from each sequence, then compute AUC, accuracy,
    precision, and recall over the concatenated valid steps."""
    binary_preds = []
    preds = []
    target_correctnesses = []
for seq_idx, seq_len in enumerate(sequence_len):
binary_preds.append(binary_pred[seq_idx, :seq_len])
preds.append(pred[seq_idx, :seq_len])
target_correctnesses.append(target_correctness[seq_idx, :seq_len])
new_binary_pred = np.concatenate(binary_preds)
new_pred = np.concatenate(preds)
new_target_correctness = np.concatenate(target_correctnesses)
auc = roc_auc_score(new_target_correctness, new_pred)
accuracy = accuracy_score(new_target_correctness, new_binary_pred)
precision = precision_score(new_target_correctness, new_binary_pred)
recall = recall_score(new_target_correctness, new_binary_pred)
return auc, accuracy, precision, recall
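# Note: roc_auc_score raises a ValueError when a batch contains only one
# class, which can happen on small or heavily skewed evaluation batches.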
class DKTEngine(object):
def __init__(self):
self.config = Config()
self.train_dkt = None
self.test_dkt = None
self.sess = None
self.global_step = 0
    def add_gradient_noise(self, grad, stddev=1e-3, name=None):
        """
        Adds gradient noise as described in http://arxiv.org/abs/1511.06807.
        """
        # tf.op_scope is gone from the v1 API; tf.name_scope with the
        # (name, default_name, values) signature is the replacement.
        with tf.name_scope(name, "add_gradient_noise", [grad, stddev]) as name:
            grad = tf.convert_to_tensor(grad, name="grad")
            gn = tf.random_normal(tf.shape(grad), stddev=stddev)
            return tf.add(grad, gn, name=name)
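    # Note: add_gradient_noise is not applied anywhere below. A sketch of how
    # it could be wired into run_epoch's gradient step, reusing the
    # grads_and_vars list built there:
    #   grads_and_vars = [(self.add_gradient_noise(g), v) for g, v in grads_and_vars]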
def train_step(self, params, train_op, train_summary_op, train_summary_writer):
"""
A single training step
"""
dkt = self.train_dkt
sess = self.sess
global_step = self.global_step
feed_dict = {dkt.input_data: params['input_x'],
dkt.target_id: params['target_id'],
dkt.target_correctness: params['target_correctness'],
dkt.max_steps: params['max_len'],
dkt.sequence_len: params['seq_len'],
dkt.keep_prob: self.config.modelConfig.dropout_keep_prob}
_, step, summaries, loss, binary_pred, pred, target_correctness = sess.run(
[train_op, global_step, train_summary_op, dkt.loss, dkt.binary_pred, dkt.pred, dkt.target_correctness],
feed_dict)
auc, accuracy, precision, recall = gen_metrics(params['seq_len'], binary_pred, pred, target_correctness)
time_str = datetime.datetime.now().isoformat()
print("train: {}: step {}, loss {}, acc {}, auc: {}, precision: {}, recall: {}".format(time_str, step, loss, accuracy,
auc, precision, recall))
train_summary_writer.add_summary(summaries, step)
def dev_step(self, params, dev_summary_op, writer=None):
"""
Evaluates model on a dev set
"""
dkt = self.test_dkt
sess = self.sess
global_step = self.global_step
feed_dict = {dkt.input_data: params['input_x'],
dkt.target_id: params['target_id'],
dkt.target_correctness: params['target_correctness'],
dkt.max_steps: params['max_len'],
dkt.sequence_len: params['seq_len'],
                     dkt.keep_prob: 1.0}  # disable dropout at evaluation time
step, summaries, loss, pred, binary_pred, target_correctness = sess.run(
[global_step, dev_summary_op, dkt.loss, dkt.pred, dkt.binary_pred, dkt.target_correctness],
feed_dict)
auc, accuracy, precision, recall = gen_metrics(params['seq_len'], binary_pred, pred, target_correctness)
if writer:
writer.add_summary(summaries, step)
return loss, accuracy, auc, precision, recall
def run_epoch(self, fileName):
config = Config()
dataGen = DataGenerator(fileName, config)
dataGen.gen_attr()
train_seqs = dataGen.train_seqs
test_seqs = dataGen.test_seqs
session_conf = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=False
)
sess = tf.Session(config=session_conf)
self.sess = sess
        with sess.as_default():
            # Two views of one network: reuse=True makes the test graph share
            # the variables created for the train graph.
            with tf.name_scope("train"):
                with tf.variable_scope("dkt", reuse=None):
                    train_dkt = TensorFlowDKT(config)
            with tf.name_scope("test"):
                with tf.variable_scope("dkt", reuse=True):
                    test_dkt = TensorFlowDKT(config)
            self.train_dkt = train_dkt
            self.test_dkt = test_dkt
global_step = tf.Variable(0, name="global_step", trainable=False)
self.global_step = global_step
optimizer = tf.train.AdamOptimizer(config.trainConfig.learning_rate)
grads_and_vars = optimizer.compute_gradients(train_dkt.loss)
grads_and_vars = [(tf.clip_by_norm(g, config.trainConfig.max_grad_norm), v)
for g, v in grads_and_vars if g is not None]
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name="train_op")
grad_summaries = []
for g, v in grads_and_vars:
if g is not None:
grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
grad_summaries.append(grad_hist_summary)
grad_summaries.append(sparsity_summary)
grad_summaries_merged = tf.summary.merge(grad_summaries)
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("writing to {}".format(out_dir))
train_loss_summary = tf.summary.scalar("loss", train_dkt.loss)
train_summary_op = tf.summary.merge([train_loss_summary, grad_summaries_merged])
train_summary_dir = os.path.join(out_dir, "summaries", "train")
train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
test_loss_summary = tf.summary.scalar("loss", test_dkt.loss)
dev_summary_op = tf.summary.merge([test_loss_summary])
dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
saver = tf.train.Saver(tf.global_variables())
sess.run(tf.global_variables_initializer())
print("初始化完毕,开始训练")
for i in range(config.trainConfig.epochs):
np.random.shuffle(train_seqs)
for params in dataGen.next_batch(train_seqs):
"""
print("input_x:", params['input_x'].shape)
print("target_id:", params['target_id'].shape)
print("target_correctness:", params['target_correctness'].shape)
print("seq_len:", params['seq_len'])
print("max_len:", params['max_len'])
"""
self.train_step(params, train_op, train_summary_op, train_summary_writer)
current_step = tf.train.global_step(sess, global_step)
if current_step % config.trainConfig.evaluate_every == 0:
print("\nEvaluation:")
                        losses = []
                        accuracies = []
                        aucs = []
                        precisions = []
                        recalls = []
                        # Pass the dev writer so evaluation summaries are actually recorded.
                        for params in dataGen.next_batch(test_seqs):
                            loss, accuracy, auc, precision, recall = self.dev_step(params, dev_summary_op,
                                                                                   writer=dev_summary_writer)
                            losses.append(loss)
                            accuracies.append(accuracy)
                            aucs.append(auc)
                            precisions.append(precision)
                            recalls.append(recall)
                        time_str = datetime.datetime.now().isoformat()
                        print("dev: {}, step: {}, loss: {}, acc: {}, auc: {}, precision: {}, recall: {}".
                              format(time_str, current_step, mean(losses), mean(accuracies), mean(aucs),
                                     mean(precisions), mean(recalls)))
                    if current_step % config.trainConfig.checkpoint_every == 0:
                        os.makedirs("model", exist_ok=True)  # Saver does not create missing directories
                        path = saver.save(sess, "model/my-model", global_step=current_step)
print("Saved model checkpoint to {}\n".format(path))
if __name__ == "__main__":
fileName = "./data/assistments.txt"
dktEngine = DKTEngine()
dktEngine.run_epoch(fileName)
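# Expects ./data/assistments.txt with one "<student_id> <skill_id> <is_correct>"
# interaction per line; checkpoints go to ./model and TensorBoard summaries to
# ./runs/<timestamp>.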