# Hello-world / print demo, then load and tokenise a sample textbook file.
string = "Hello World!"
print(string)
print("1234")
# readtext / splitwords are project helpers defined elsewhere; presumably
# readtext returns the file's contents as one string and splitwords
# tokenises it into a word list — TODO confirm against their definitions.
text_1 = readtext("./nlp/data/textbooks/grade0/text0.txt")
print(text_1)
text = readtext("./nlp/data/textbooks/grade0/text0.txt")
words = splitwords(text)
# Demo: list indexing, in-place mutation, append, length, and iteration.
l1 = [1, 3, 5, 7, 9, 11]
l2 = [1, 'a', ['3.14', 1.5], 'bc']  # lists may mix types and nest
print(l1[2])    # third element -> 5
print(l2[3])    # fourth element -> 'bc'
l1[1] = 20      # replace the second element in place
print(l1)
l2.append('a')  # grow the list by one element
print(l2)
print(len(l1))
print(len(l2))
# FIX: the loop body below was not indented in the original (IndentationError).
for i in l1:
    print(i)
# Demo: dict lookup, update, insertion, key membership, and iteration.
word_freq_dict = {'you': 0.098, 'the': 0.020, 'friend': 0.059}
print(word_freq_dict)
print(word_freq_dict['you'])   # read a value by key
word_freq_dict['you'] = 0.088  # overwrite an existing entry
print(word_freq_dict)
word_freq_dict['her'] = 0.0392  # insert a brand-new key
print(word_freq_dict)
print('you' in word_freq_dict)  # membership tests keys -> True
print('he' in word_freq_dict)   # -> False
# FIX: the loop body below was not indented in the original (IndentationError).
for key, value in word_freq_dict.items():
    print(key+':'+str(value))
def word_freq(words):
    """Return a dict mapping each word to its relative frequency in *words*.

    Parameters
    ----------
    words : iterable of str
        Token sequence (e.g. the output of ``splitwords``).

    Returns
    -------
    dict
        ``word -> count / total``.  An empty input yields ``{}``.
    """
    # FIX: the original's loop bodies were not indented (IndentationError)
    # and an empty input raised ZeroDivisionError.  Counter replaces the
    # hand-rolled tally loop.
    counts = Counter(words)
    total = sum(counts.values())
    if total == 0:
        return {}
    return {word: count / total for word, count in counts.items()}
# Compute and display relative word frequencies for the grade-0 textbook.
text = readtext("nlp/data/textbooks/grade0/text0.txt")
words = splitwords(text)
freq_dict = word_freq(words)
print(freq_dict)
# load_textbooks_data is a project helper; its items look like
# (path, grade) pairs, judging by how get_diff_level unpacks them below.
textbooks_data = load_textbooks_data()
print(len(textbooks_data))
print(textbooks_data[0:4])
def get_diff_level(path_grade):
    """Build a word -> lowest-grade mapping from graded textbook files.

    Parameters
    ----------
    path_grade : iterable of (path, grade) pairs
        Each ``path`` is read via ``readtext``/``splitwords``; ``grade``
        must be convertible to ``int``.

    Returns
    -------
    dict
        Each word maps to the smallest grade whose textbook contains it,
        i.e. the grade at which the word is first encountered.
    """
    # FIX: the loop bodies were not indented in the original (IndentationError).
    diff_level = {}
    for path, grade in path_grade:
        words = splitwords(readtext(path))
        grade = int(grade)
        for word in words:
            # keep only the lowest (easiest) grade seen for each word
            if word not in diff_level or diff_level[word] > grade:
                diff_level[word] = grade
    return diff_level
# Build the word -> easiest-grade mapping and cache it on disk.
diff_level = get_diff_level(textbooks_data)
print(diff_level)
# save_private / load_private are project helpers for (de)serialising
# intermediate results; the reload below round-trips the cache.
save_private(diff_level, "./data/tmp/diff_level")
diff_level = load_private("./data/tmp/diff_level")
print(diff_level)
# Load one training passage and tokenise it.
text = readtext("nlp/data/reading/train/text0.txt")
print(text)
words = splitwords(text)
print(len(words))
# A 12-slot counter (one slot per school grade 0-11), plus a quick demo
# of building fixed-size lists with the repetition operator.
grade_freq = [0] * 12
l1 = [0] * 3
l2 = ['a'] * 5
print(l1)
print(l2)
# Count, for each grade 0-11, how many words of the passage fall at that
# difficulty level, then normalise the counts into a distribution.
# (Uses `words` and `diff_level` computed above.)
grade_freq = [0]*12
# FIX: the loop bodies below were not indented in the original
# (IndentationError); the redundant `else: continue` was dropped.
for word in words:
    if word in diff_level:  # words missing from the mapping are skipped
        grade = diff_level[word]
        grade_freq[grade] += 1
print(grade_freq)
num = sum(grade_freq)
print(num)
# FIX: guard against a passage with no known words (ZeroDivisionError).
if num:
    for i in range(12):
        grade_freq[i] /= num
print(grade_freq)
def extract_features(path, diff_level):
    """Return a 12-bin grade-difficulty distribution for the text at *path*.

    Parameters
    ----------
    path : str
        File readable by the project helper ``readtext``.
    diff_level : dict
        ``word -> grade`` (0-11) mapping, e.g. from ``get_diff_level``.

    Returns
    -------
    list of float
        ``grade_freq[g]`` is the fraction of recognised words whose
        difficulty is grade ``g``; all zeros when no word of the text
        appears in ``diff_level``.
    """
    # FIX: the loop bodies were not indented in the original
    # (IndentationError), and a text containing no known words raised
    # ZeroDivisionError.
    words = splitwords(readtext(path))
    grade_freq = [0] * 12
    for word in words:
        if word in diff_level:  # unknown words are simply skipped
            grade_freq[diff_level[word]] += 1
    num = sum(grade_freq)
    if num == 0:
        return [0.0] * 12
    return [count / num for count in grade_freq]
# Feature vector for a single training text.
grade_freq = extract_features("nlp/data/reading/train/text1.txt", diff_level)
print(grade_freq)
# Reload the cached difficulty mapping and fetch the train/test splits.
diff_level = load_private("./data/tmp/diff_level")
# load_train_data / load_test_data are project helpers; their items look
# like (path, label) pairs, judging by how they are unpacked below.
train_data = load_train_data()
print(len(train_data))
print(train_data[0:5])
test_data = load_test_data()
print(len(test_data))
print(test_data[0:5])
# Build parallel feature/label lists for the training split.
features = []
labels = []
# FIX: the loop body was not indented in the original (IndentationError).
for path, label in train_data:
    features.append(extract_features(path, diff_level))
    labels.append(int(label))
def get_feats_labels(data, diff_level):
    """Extract features and integer labels for every (path, label) pair.

    Parameters
    ----------
    data : iterable of (path, label) pairs
        ``label`` must be convertible to ``int``.
    diff_level : dict
        ``word -> grade`` mapping passed through to ``extract_features``.

    Returns
    -------
    (list, list)
        Parallel lists: feature vectors and their int labels.
    """
    # FIX: the loop bodies were not indented in the original (IndentationError).
    features = []
    labels = []
    for path, label in data:
        features.append(extract_features(path, diff_level))
        labels.append(int(label))
    return features, labels
# Featurise the train and test splits and cache the results on disk.
train_feats, train_labels = get_feats_labels(train_data, diff_level)
print(train_feats[0:5])
print(train_labels[0:5])
save_private([train_feats, train_labels], "./data/tmp/train_features")
# Round-trip the cache to verify it loads back unchanged.
train_feats, train_labels = load_private("./data/tmp/train_features")
print(train_feats[0:5])
print(train_labels[0:5])
test_feats, test_labels = get_feats_labels(test_data, diff_level)
save_private([test_feats, test_labels], "./data/tmp/test_features")
# Binary classification task: primary vs. junior school reading level.
train_data = load_binary_train_data("primary", "junior")
print(len(train_data))
print(train_data[0:5])
test_data = load_binary_test_data("primary", "junior")
# FIX: the original never recomputed train_feats/train_labels for the
# binary split, so stale features from the earlier multi-grade task were
# saved and trained on.  Recompute both splits here.
train_feats, train_labels = get_feats_labels(train_data, diff_level)
test_feats, test_labels = get_feats_labels(test_data, diff_level)
save_private([train_feats, train_labels], "./data/tmp/pri_jun_train_features")
save_private([test_feats, test_labels], "./data/tmp/pri_jun_test_features")
# Train a linear classifier and report accuracy on the held-out test set.
model = linear_classifier()
model.train(train_feats, train_labels)
pred_y = model.pred(test_feats)
acc = accuracy(pred_y, test_labels)
print(acc)
# Bag-of-words pipeline: corpus -> vocabulary -> TF and TF-IDF features.
# NOTE(review): `data`, `fig`, `plot`, `split_words`, `load_stopwords`,
# `build_vocabulary`, `TermFrequency` and `TFIDF` are provided by the
# hosting environment — confirm their contracts there.
corpus = data.get('corpus')
doc = corpus[87]  # inspect one sample document
fig() + plot(doc)
word_bags = corpus.map(split_words)
stop_words = load_stopwords()
fig() + plot(stop_words)
# Build the vocabulary, dropping stop words and words occurring fewer
# than `frequency_threshold` times.
vocabulary = build_vocabulary(word_bags, stop_words = stop_words, frequency_threshold = 5)
fig() + plot(vocabulary)
print('Vocabulary Length: ', len(vocabulary))
# Term-frequency features over the vocabulary.
tf = TermFrequency(vocabulary)
tf_features = word_bags.map(tf)
feat = tf_features[87]
fig() + plot(feat)
# TF-IDF reweights terms by how document-specific they are.
tfidf = TFIDF(vocabulary, word_bags)
tfidf_features = word_bags.map(tfidf)
feat = tfidf_features[87]
fig() + plot(feat)
# Topic modelling on precomputed TF-IDF features.
corpus, vocab, tf_feat, tfidf_feat = data.get('text-feat')
tfidf_mat = to_matrix(tfidf_feat)
print("文档-词矩阵尺寸:",tfidf_mat.shape)  # prints the document-word matrix size (runtime string kept as-is)
model = topic_model(vocab, tfidf_mat, num_topics=8)
model.train()
# tmatrix / wmatrix are the model's two factor matrices; w_mat is indexed
# by document id further below, so it presumably holds per-document topic
# weights — TODO confirm which matrix is topic-word vs document-topic.
t_mat = model.tmatrix
w_mat = model.wmatrix
print('Size of T Matrix: ', t_mat.shape)
print('Size of W Matrix: ', w_mat.shape)
# Top-5 highest-frequency words per topic, for eyeballing topic quality.
high_freqs = model.extract_highfreqs(top_n=5)
fig() + plot(high_freqs)
# Reference image:
# Inspect one document: its raw text vs. its learned topic weights.
doc_index = 87  # FIX: renamed from `id`, which shadowed the builtin
doc = corpus[doc_index]
fig() + plot(doc)
topic_weights = w_mat[doc_index]
# Side-by-side: topic high-frequency words and this document's topic mix.
fig(2, 1) + [plot(high_freqs), plot(topic_weights)]