1. Load the dataset.
2. Split it into training and test sets.
3. Linear regression model: build a model that predicts house price from the 13 feature variables, and evaluate how good it is.
4. Polynomial regression model: build a model that predicts house price from the 13 feature variables, and evaluate how good it is.
5. Compare the performance of the linear and nonlinear models and explain the reason (a comparison sketch follows the code below).
# Multivariate linear regression model
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; this script assumes an older version
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error  # the old sklearn.metrics.regression module was removed

# Boston housing dataset
data = load_boston()

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3)

# Fit the linear regression model
bos_lg = LinearRegression()
bos_lg.fit(x_train, y_train)
print('Coefficients:', bos_lg.coef_, "\nIntercept:", bos_lg.intercept_)

# Evaluate the model
y_predict = bos_lg.predict(x_test)
print("Mean squared error:", mean_squared_error(y_test, y_predict))
print("Mean absolute error:", mean_absolute_error(y_test, y_predict))
print("Model R^2 score:", bos_lg.score(x_test, y_test))
print('=================\n')

# Multivariate polynomial regression model
# Expand the 13 features into degree-2 polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly2 = PolynomialFeatures(degree=2)
x_poly_train = poly2.fit_transform(x_train)
x_poly_test = poly2.transform(x_test)

# Fit the model
bos_lgp = LinearRegression()
bos_lgp.fit(x_poly_train, y_train)

# Predict
y_predict2 = bos_lgp.predict(x_poly_test)

# Evaluate the model
print("Mean squared error:", mean_squared_error(y_test, y_predict2))
print("Mean absolute error:", mean_absolute_error(y_test, y_predict2))
print("Model R^2 score:", bos_lgp.score(x_poly_test, y_test))
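Step 5 asks why the two models differ. The main reason is capacity: PolynomialFeatures(degree=2) expands the 13 inputs into 105 columns (bias, linear, squared, and pairwise interaction terms), so the polynomial model can fit feature interactions the linear model cannot, but the extra capacity also risks overfitting. A minimal comparison sketch, reusing the variables defined above; the gap between train and test R^2 is the evidence to cite when explaining the result:

# Compare train vs. test R^2 for both models (variables from the code above).
# A large train/test gap for the degree-2 model indicates overfitting.
for name, model, xtr, xte in [('linear', bos_lg, x_train, x_test),
                              ('poly-2', bos_lgp, x_poly_train, x_poly_test)]:
    print(name,
          'train R^2 =', round(model.score(xtr, y_train), 3),
          'test R^2 =', round(model.score(xte, y_test), 3))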
二、Chinese Text Classification
Download the dataset corresponding to the last digit of your student ID:
147: 财经 (finance), 彩票 (lottery), 房产 (real estate), 股票 (stocks)
258: 家居 (home), 教育 (education), 科技 (technology), 社会 (society), 时尚 (fashion)
0369: 时政 (politics), 体育 (sports), 星座 (horoscope), 游戏 (games), 娱乐 (entertainment)
Build a Chinese text classification model for your assigned categories. The basic steps are as follows:
1. Read and write the corpus files.
2. Remove noise: convert formats, strip symbols, and normalize the text.
3. Traverse every text file under each category folder.
4. Segment the Chinese text with jieba.
Chinese word segmentation splits a sentence into individual words; because segmentation is highly ambiguous across contexts, this step is critical.
You can add a single word with jieba.add_word('word') and load a custom dictionary with jieba.load_userdict('wordDict.txt').
Maintain your own custom dictionary.
5. Remove stop words.
Maintain your own stop-word list.
6. Compute word weights for the processed text with the TF-IDF algorithm (steps 4-6 are illustrated in the sketch after this list).
7. Predict the category with a naive Bayes classifier.
8. Evaluate the model.
9. Predict the category of new text (see the prediction sketch after the model code).
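Steps 4-6 can be tried in isolation before building the full model. Below is a minimal sketch; the stopword.txt file is the same one the model code loads, while the custom word '沪深300' and the two sample documents are purely illustrative. TfidfVectorizer uses the smoothed IDF idf(t) = ln((1+n)/(1+df(t))) + 1 by default, so words that appear in many documents get low weights.

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

jieba.add_word('沪深300')  # illustrative custom word; bulk entries go in wordDict.txt

# Load the stop-word list (same file the model code uses)
stopword = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8')]

docs = ['今日股票大盘上涨', '彩票开奖结果公布']  # two illustrative mini-documents
cut_docs = [' '.join(w for w in jieba.cut(d) if w not in stopword) for d in docs]
print(cut_docs)  # e.g. ['今日 股票 大盘 上涨', '彩票 开奖 结果 公布']

# TF-IDF weights: each row is a document, each column a word
vec = TfidfVectorizer()
tfidf = vec.fit_transform(cut_docs)
print(vec.get_feature_names_out())
print(tfidf.toarray())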
Model code:
import jieba
import collections
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from mythread import my_main
from pylab import mpl

mpl.rcParams['font.sans-serif'] = ['FangSong']  # default font for Chinese text

'''Earlier single-threaded loader, superseded by mythread.py:
def get_data():
    data = []
    stopword = get_stopword()
    label = []
    for i in range(644580, 644602):  # 股票
        file = "d:/data/147/temp/股票/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('股票')
    for i in range(264410, 264429):  # 房产
        file = "d:/data/147/temp/房产/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('房产')
    for i in range(256822, 256843):  # 彩票
        file = "d:/data/147/temp/彩票/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('彩票')
    for i in range(798977, 798999):  # 财经
        file = "d:/data/147/temp/财经/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('财经')
    return data, label

def get_stopword():
    # Load the stop-word list
    stopwords = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
    stopwords.append('\u3000')
    stopwords.append('\n')
    return stopwords
'''

def xiangliang(x_train, x_test):
    # Vectorize: TF-IDF over unigrams and bigrams, dropping terms seen in fewer than 2 documents
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), strip_accents='unicode')  # norm='l2' is the default
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, vectorizer

def beiNB(x_train, y_train, x_test):
    # Multinomial naive Bayes classifier
    clf = MultinomialNB().fit(x_train, y_train)
    y_nb_pred = clf.predict(x_test)
    return y_nb_pred, clf

def result(vectorizer, clf):
    # Classification results; y_test and y_nb_pred are read from the
    # module-level scope set up in __main__
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    print('====================shape')
    print(y_nb_pred.shape, y_nb_pred)
    print('nb_confusion_matrix:')
    cm = confusion_matrix(y_test, y_nb_pred)
    print(cm)
    cr = classification_report(y_test, y_nb_pred)
    print(cr)
    # Lowest- and highest-weighted features of the first class
    feature_names = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
    coefs = clf.feature_log_prob_  # coef_ on MultinomialNB was deprecated and later removed
    intercept = clf.class_log_prior_  # unused; formerly clf.intercept_
    coefs_with_fns = sorted(zip(coefs[0], feature_names))
    n = 10
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    print('=================coef')
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

if __name__ == '__main__':
    data, label = my_main()
    print(len(data))
    print(label, len(label))
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=0, stratify=label)
    X_train, X_test, vectorizer = xiangliang(x_train, x_test)
    y_nb_pred, clf = beiNB(X_train, y_train, X_test)
    result(vectorizer, clf)

    plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False    # display minus signs correctly

    # Count each news category in the test set and in the predictions
    testCount = collections.Counter(y_test)
    predCount = collections.Counter(y_nb_pred)
    print('实际:', testCount, '\n', '预测', predCount)

    # Build the label list and the actual/predicted count lists
    nameList = list(testCount.keys())
    testList = list(testCount.values())
    predictList = [predCount[name] for name in nameList]  # align with nameList order (Counter order may differ)
    x = list(range(len(nameList)))
    print("新闻类别:", nameList, '\n', "实际:", testList, '\n', "预测:", predictList)

    # Plot actual vs. predicted counts per category
    plt.figure(figsize=(7, 5))
    total_width, n = 0.6, 2
    width = total_width / n
    plt.bar(x, testList, width=width, label='实际', fc='g')
    for i in range(len(x)):
        x[i] = x[i] + width
    plt.bar(x, predictList, width=width, label='预测', tick_label=nameList, fc='b')
    plt.grid()
    plt.title('实际和预测对比图', fontsize=17)
    plt.xlabel('新闻类别', fontsize=17)
    plt.ylabel('频数', fontsize=17)
    plt.legend(fontsize=17)
    plt.tick_params(labelsize=15)
    plt.show()
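Step 9 (classifying new text) is not covered by the script above. A minimal sketch, assuming it runs at the end of __main__ so the fitted vectorizer and clf are in scope; the example sentence is hypothetical, and get_stopword is reused from mythread.py:

from mythread import get_stopword  # reuse the stop-word loader

new_text = '央行宣布下调存款准备金率'  # hypothetical new document
cut = ' '.join(w for w in jieba.cut(new_text) if w not in get_stopword())
X_new = vectorizer.transform([cut])  # reuse the fitted TF-IDF vocabulary
print('预测类别:', clf.predict(X_new)[0])  # e.g. '财经'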
mythread.py
# from nlt_cut import get_stopword, xiangliang, beiNB, result
import threading  # needed by myThread below (import was missing in the original)
import jieba      # needed by read_txt (import was missing in the original)
from sklearn.model_selection import train_test_split
import numpy as np

class myThread(threading.Thread):
    '''Thread class that reads one range of corpus files.'''
    def __init__(self, threadID, name, start_number, end_number):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name  # also the category/folder name
        self.start_number = start_number
        self.end_number = end_number

    def run(self):
        print('Start reading files: ' + self.name)
        read_txt(self.name, self.start_number, self.end_number)
        print('Finished reading files: ' + self.name)

def get_stopword():
    '''Load the stop-word list.'''
    stopwords = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
    stopwords.append('\u3000')
    stopwords.append('\n')
    return stopwords

data = []
label = []
stopword = get_stopword()

def read_txt(threadName, start_number, end_number):
    # Segment each file with jieba, drop stop words, and append the result to
    # the shared lists. list.append is atomic under the GIL, so the threads
    # can share data/label safely (the resulting order is nondeterministic).
    for i in range(start_number, end_number):
        file = "d:/data/147//147/" + threadName + "/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
            this_news = ""
            for ci in jieba.cut(news):
                if ci not in stopword:
                    this_news = this_news + ci + " "
            data.append(this_news)
            label.append(threadName)

def get_data():
    return data, label

def my_new_thread():
    '''thread1 = myThread(1, '财经', 798977, 836075)
    thread2 = myThread(2, '彩票', 256822, 264410)
    thread3 = myThread(3, '房产', 264410, 284460)
    thread4 = myThread(4, '股票', 644579, 798977)'''
    # Each category's file-number range is split across threads so all four
    # categories are read concurrently. Note the GIL means segmentation is
    # not truly parallel; the threads mainly overlap file I/O.
    thread1 = myThread(1, '财经', 798977, 810000)
    thread2 = myThread(2, '彩票', 256822, 260000)
    thread3 = myThread(3, '房产', 264410, 270000)
    thread4 = myThread(4, '股票', 644579, 700000)
    thread5 = myThread(5, '财经', 810000, 836075)
    thread6 = myThread(6, '彩票', 260000, 264410)
    thread7 = myThread(7, '房产', 270000, 284460)
    thread8 = myThread(8, '股票', 700000, 750000)
    thread9 = myThread(9, '股票', 750000, 798977)
    threads = [thread1, thread2, thread3, thread4, thread5, thread6, thread7, thread8, thread9]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('All threads finished')

def my_main():
    my_new_thread()
    data, label = get_data()
    return data, label

'''if __name__ == '__main__':
    my_new_thread()
    data, label = get_data()
    np.save("d:/data.npy", np.array(data))
    np.save("d:/label.npy", np.array(label))
    print("=======================")
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=0, stratify=label)
    X_train, X_test, vectorizer = xiangliang(x_train, x_test)
    y_nb_pred, clf = beiNB(X_train, y_train, X_test)
    result(vectorizer, clf, y_nb_pred)'''
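The commented-out __main__ block above saves the segmented corpus to d:/data.npy and d:/label.npy. A small sketch of how a later run could reuse that cache instead of re-reading thousands of files (paths as in the original; allow_pickle=True covers the case where numpy stored the strings as an object array):

import numpy as np

# Reload the cached corpus saved by the commented-out __main__ above.
data = np.load("d:/data.npy", allow_pickle=True).tolist()
label = np.load("d:/label.npy", allow_pickle=True).tolist()
print(len(data), len(label))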