Final Assignment

1. Load the dataset.

2. Split it into training and test sets.

3. Linear regression model: build a model that predicts house prices from the 13 features, and evaluate how well it performs.

4. Polynomial regression model: build a model that predicts house prices from the 13 features, and evaluate how well it performs.

5. Compare the performance of the linear and non-linear models and explain the difference (see the comparison sketch after the code below).

# Multiple linear regression model
from sklearn.datasets import load_boston   # note: removed in scikit-learn 1.2; use an older release
from sklearn.model_selection import train_test_split

# Boston housing dataset: 13 features, median home price as the target
data = load_boston()

# Split into training and test sets (70% / 30%)
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3)

# Build the linear regression model
from sklearn.linear_model import LinearRegression
bos_lg = LinearRegression()
bos_lg.fit(x_train, y_train)
print('Coefficients:', bos_lg.coef_, "\nIntercept:", bos_lg.intercept_)

# Evaluate the model
# (import the metric functions directly; sklearn.metrics.regression is a private module)
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_predict = bos_lg.predict(x_test)
# Compute the prediction metrics
print("Mean squared error:", mean_squared_error(y_test, y_predict))
print("Mean absolute error:", mean_absolute_error(y_test, y_predict))
# Print the model's R^2 score
print("Model score:", bos_lg.score(x_test, y_test))
print('=================\n')

# Multivariate polynomial regression model
# Expand to polynomial features: degree 2 turns the 13 features into 105
# (1 bias + 13 linear + 13 squares + 78 pairwise products)
from sklearn.preprocessing import PolynomialFeatures
poly2 = PolynomialFeatures(degree=2)
x_poly_train = poly2.fit_transform(x_train)
x_poly_test = poly2.transform(x_test)

# Build the model
bos_lgp = LinearRegression()
bos_lgp.fit(x_poly_train, y_train)

# Predict
y_predict2 = bos_lgp.predict(x_poly_test)
# Evaluate the model
# Compute the prediction metrics
print("Mean squared error:", mean_squared_error(y_test, y_predict2))
print("Mean absolute error:", mean_absolute_error(y_test, y_predict2))
# Print the model's R^2 score
print("Model score:", bos_lgp.score(x_poly_test, y_test))
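For task 5, the comparison is easiest to ground by also scoring both models on the training set. The degree-2 model has 105 features instead of 13, so it always fits the training data at least as well, but whether it also wins on the test set depends on the split. A minimal sketch, reusing the variables defined above:

# Hedged comparison for task 5: train vs. test R^2 for both models
print("Linear     - train R^2:", bos_lg.score(x_train, y_train),
      "  test R^2:", bos_lg.score(x_test, y_test))
print("Polynomial - train R^2:", bos_lgp.score(x_poly_train, y_train),
      "  test R^2:", bos_lgp.score(x_poly_test, y_test))
# Expected pattern: the polynomial model's training R^2 is higher because of
# its extra capacity; if its test R^2 is also higher, the feature-price
# relationship is genuinely non-linear, while a large train/test gap
# signals overfitting instead.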

 

II. Chinese Text Classification

Download the dataset that matches the last digit of your student ID:

147: 财经 (finance), 彩票 (lottery), 房产 (real estate), 股票 (stocks)

258: 家居 (home), 教育 (education), 科技 (technology), 社会 (society), 时尚 (fashion)

0369: 时政 (politics), 体育 (sports), 星座 (horoscope), 游戏 (games), 娱乐 (entertainment)

Build a Chinese text-classification model for those categories. The basic steps are as follows:

1. Obtain the raw files and write out processed files.

2. Remove noise: convert formats, strip punctuation, normalize the text.

3. Walk every text file in each category folder.

4. Segment the Chinese text with jieba (steps 4-7 are illustrated in the sketch after this list).

Chinese word segmentation splits a sentence into its component words; because segmentation is highly ambiguous across contexts, this step is critical.

You can add individual words with jieba.add_word('word') and load a whole dictionary with jieba.load_userdict('wordDict.txt').

Maintain a custom dictionary.

5. Remove stop words.

Maintain a stop-word list.

6. Compute word weights for the processed text with the TF-IDF algorithm.

7. Predict the category with a naive Bayes classifier.

8. Evaluate the model.

9. Predict the category of new text.
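Before the full program, here is a minimal self-contained sketch of steps 4-7 on a handful of made-up sentences (the sentences, the added word, and the tiny stop-word set are all illustrative, not part of the real dataset); the full pipeline below does the same thing at scale:

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Step 4: segmentation; extend the dictionary for a domain term
jieba.add_word('双色球')

raw = [('今天股市大涨,股票全线走强', '股票'),
       ('双色球开奖结果公布,大奖无人认领', '彩票'),
       ('央行发布报告,利率保持不变', '财经'),
       ('股票市场交易活跃,股民信心增强', '股票'),
       ('彩票销售额上升,彩民热情高涨', '彩票'),
       ('分析人士称宏观经济稳中向好', '财经')]

# Step 5: drop stop words (a tiny illustrative list)
stopwords = {',', '的', '了'}
docs = [' '.join(w for w in jieba.cut(text) if w not in stopwords)
        for text, _ in raw]
labels = [lab for _, lab in raw]

# Step 6: TF-IDF weighting; scikit-learn uses the smoothed formula
# idf(t) = ln((1 + n) / (1 + df(t))) + 1 and then L2-normalizes each row
vec = TfidfVectorizer()
X = vec.fit_transform(docs)

# Step 7: multinomial naive Bayes on the TF-IDF matrix
clf = MultinomialNB().fit(X, labels)
new_doc = ' '.join(jieba.cut('股市行情看涨'))
print(clf.predict(vec.transform([new_doc])))  # expected: ['股票']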

The model

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from myThread import my_main   # defined in mythread.py, shown below
import collections
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong'] # set the default font so Chinese labels render
'''Superseded single-threaded loader, kept for reference:
def get_data():
    data = []
    stopword = get_stopword()
    label = []
    for i in range(644580, 644602):  # 股票 (stocks)
        file = "d:/data/147/temp/股票/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('股票')

    for i in range(264410, 264429):  # 房产 (real estate)
        file = "d:/data/147/temp/房产/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('房产')

    for i in range(256822, 256843):  # 彩票 (lottery)
        file = "d:/data/147/temp/彩票/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('彩票')

    for i in range(798977, 798999):  # 财经 (finance)
        file = "d:/data/147/temp/财经/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('财经')
    return data, label

def get_stopword():
    # Load the stop-word list
    stopwords = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
    stopwords.append('\u3000')
    stopwords.append('\n')
    return stopwords
'''
def xiangliang(x_train, x_test):
    # Vectorize the segmented text with TF-IDF (norm='l2' is the default)
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), strip_accents='unicode')
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, vectorizer

def beiNB(x_train, y_train, x_test):
    # Train a multinomial naive Bayes classifier and predict on the test set
    clf = MultinomialNB().fit(x_train, y_train)
    y_nb_pred = clf.predict(x_test)

    return y_nb_pred, clf

def result(vectorizer, clf):
    # Report classification results; reads y_test and y_nb_pred from module scope
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    print('====================shape')
    print(y_nb_pred.shape, y_nb_pred)
    print('nb_confusion_matrix:')
    cm = confusion_matrix(y_test, y_nb_pred)
    print(cm)
    cr = classification_report(y_test, y_nb_pred)
    print(cr)

    # Most and least indicative features for the first class
    # (get_feature_names() and MultinomialNB.coef_ are from older scikit-learn;
    #  newer releases use get_feature_names_out() and feature_log_prob_)
    feature_names = vectorizer.get_feature_names()
    coefs = clf.coef_
    intercept = clf.intercept_
    coefs_with_fns = sorted(zip(coefs[0], feature_names))

    n = 10
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    print('=================coef')
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

if __name__ == '__main__':

    data, label = my_main()
    print(len(data))
    print(label, len(label))
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=0, stratify=label)
    X_train, X_test, vectorizer = xiangliang(x_train, x_test)
    y_nb_pred, clf = beiNB(X_train, y_train, X_test)
    result(vectorizer, clf)
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

    # Count the news items per category in the test set and in the predictions
    testCount = collections.Counter(y_test)
    predCount = collections.Counter(y_nb_pred)
    print('Actual:', testCount, '\n', 'Predicted:', predCount)

    # Build the label list plus the actual and predicted count lists;
    # index predCount by nameList so the bars line up per category
    # (the two Counters may iterate their keys in different orders)
    nameList = list(testCount.keys())
    testList = list(testCount.values())
    predictList = [predCount.get(name, 0) for name in nameList]
    x = list(range(len(nameList)))
    print("Categories:", nameList, '\n', "Actual:", testList, '\n', "Predicted:", predictList)

    # Plot the comparison
    plt.figure(figsize=(7, 5))
    total_width, n = 0.6, 2
    width = total_width / n
    plt.bar(x, testList, width=width, label='实际', fc='g')   # actual
    for i in range(len(x)):
        x[i] = x[i] + width   # shift the second group of bars to the right
    plt.bar(x, predictList, width=width, label='预测', tick_label=nameList, fc='b')   # predicted
    plt.grid()
    plt.title('实际和预测对比图', fontsize=17)   # actual vs. predicted
    plt.xlabel('新闻类别', fontsize=17)          # news category
    plt.ylabel('频数', fontsize=17)              # frequency
    plt.legend(fontsize=17)
    plt.tick_params(labelsize=15)
    plt.show()

 

mythread.py

import jieba
import threading
#from nlt_cut import get_stopword,xiangliang,beiNB,result
from sklearn.model_selection import train_test_split
import numpy as np

class myThread(threading.Thread):
    '''Thread class that reads one range of files'''
    def __init__(self, threadID, name, start_number, end_number):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name                   # category name, also the folder name
        self.start_number = start_number   # first file number (inclusive)
        self.end_number = end_number       # last file number (exclusive)
    def run(self):
        print('Started reading files: ' + self.name)
        read_txt(self.name, self.start_number, self.end_number)
        print('Finished reading files: ' + self.name)

def get_stopword():
    '''Load the stop-word list'''
    stopwords = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
    stopwords.append('\u3000')
    stopwords.append('\n')
    return stopwords

data = []
label = []
stopword = get_stopword()
lock = threading.Lock()  # keeps each text aligned with its label across threads

def read_txt(threadName, start_number, end_number):

    for i in range(start_number, end_number):
        file = "d:/data/147//147/" + threadName + "/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        # Append text and label atomically; without the lock another thread
        # could interleave between the two appends and misalign the lists
        with lock:
            data.append(this_news)
            label.append(threadName)

def get_data():
    return data, label

def my_new_thread():
    '''Original one-thread-per-category ranges:
    thread1 = myThread(1, '财经', 798977, 836075)
    thread2 = myThread(2, '彩票', 256822, 264410)
    thread3 = myThread(3, '房产', 264410, 284460)
    thread4 = myThread(4, '股票', 644579, 798977)'''
    # Each category's file range is split across two or three threads
    thread1 = myThread(1, '财经', 798977, 810000)
    thread2 = myThread(2, '彩票', 256822, 260000)
    thread3 = myThread(3, '房产', 264410, 270000)
    thread4 = myThread(4, '股票', 644579, 700000)
    thread5 = myThread(5, '财经', 810000, 836075)
    thread6 = myThread(6, '彩票', 260000, 264410)
    thread7 = myThread(7, '房产', 270000, 284460)
    thread8 = myThread(8, '股票', 700000, 750000)
    thread9 = myThread(9, '股票', 750000, 798977)
    thread1.start()
    thread2.start()
    thread3.start()
    thread4.start()
    thread5.start()
    thread6.start()
    thread7.start()
    thread8.start()
    thread9.start()
    threads = [thread1, thread2, thread3, thread4, thread5, thread6, thread7, thread8, thread9]
    for t in threads:
        t.join()
    print('All reader threads finished')

def my_main():
    my_new_thread()
    data, label = get_data()
    return data, label

'''if __name__ == '__main__':
    my_new_thread()
    data, label = get_data()
    np.save("d:/data.npy", np.array(data))
    np.save("d:/label.npy", np.array(label))
    print("=======================")
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=0, stratify=label)
    X_train, X_test, vectorizer = xiangliang(x_train, x_test)
    y_nb_pred, clf = beiNB(X_train, y_train, X_test)
    result(vectorizer, clf, y_nb_pred)'''
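A design note on mythread.py: because of CPython's global interpreter lock, the nine threads overlap only on file I/O; jieba's segmentation still executes one thread at a time, so the speed-up comes from reading files rather than from parsing. The same fan-out can also be written with the standard-library thread pool instead of a hand-rolled Thread subclass; a sketch under that assumption, reusing read_txt and the file ranges above:

from concurrent.futures import ThreadPoolExecutor

# Same category/range fan-out as my_new_thread(), via a thread pool
tasks = [('财经', 798977, 810000), ('财经', 810000, 836075),
         ('彩票', 256822, 260000), ('彩票', 260000, 264410),
         ('房产', 264410, 270000), ('房产', 270000, 284460),
         ('股票', 644579, 700000), ('股票', 700000, 750000),
         ('股票', 750000, 798977)]

with ThreadPoolExecutor(max_workers=9) as pool:
    for name, start, end in tasks:
        pool.submit(read_txt, name, start, end)
# Leaving the with-block waits for all submitted tasks to finish

If segmentation itself becomes the bottleneck, multiprocessing (e.g. multiprocessing.Pool) would be needed to use more than one core.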
