Python入门经典学习1-乳腺癌分类问题

基于肿瘤特征判定是恶性肿瘤还是良性肿瘤,通过研究699个患者的肿瘤属性,找到肿瘤预测模式,根据肿瘤属性来判定肿瘤性质,对没有见过面的患者,根据属性来判定是否为恶性肿瘤。

用到的数据:链接:http://pan.baidu.com/s/1c26Dbjy 密码:gllb

###########################################
#        分类器:肿瘤良性还是恶性
###########################################



###########################################
#      读入数据集,并得到元组列表
###########################################
def ReadSet(FileName):
    """Load patient records from a comma-separated text file.

    Every usable line must hold exactly eleven comma-separated fields:
    a patient id, nine integer attribute values, and a diagnosis code.
    Blank lines and lines containing a '?' (missing data) are skipped.

    Args:
        FileName: path of the data file to read.

    Returns:
        A list of tuples ``(id, label, a1, ..., a9)`` where ``id`` is kept as
        a string, ``label`` is 'm' when the diagnosis code is '4' and 'b'
        otherwise, and ``a1``..``a9`` are ints.

    Raises:
        ValueError: if a non-skipped line does not have eleven fields or an
            attribute value is not an integer.
    """
    TrainSet = []
    # The context manager guarantees the file is closed even if parsing fails.
    with open(FileName) as TrainFile:
        for line in TrainFile:
            line = line.strip()             # drop the trailing newline / padding
            # Skip empty lines and records with missing ('?') values.
            if not line or '?' in line:
                continue
            patient_id, *attributes, diag = line.split(',')
            if len(attributes) != 9:
                raise ValueError(
                    "expected 11 comma-separated fields, got %d: %r"
                    % (len(attributes) + 2, line))
            diagMorB = 'm' if diag == '4' else 'b'
            TrainSet.append(
                (patient_id, diagMorB) + tuple(int(value) for value in attributes))
    return TrainSet
###########################################
#           训练分类器
###########################################
def sumLists(list1,list2):
    """Return a new list with the element-wise sums of the first nine items.

    Only positions 0..8 of each argument are read; both arguments must
    support indexing and have at least nine items.
    """
    return [list1[pos] + list2[pos] for pos in range(9)]

def makeAverages(listofsums,total):
    """Return the first nine entries of ``listofsums`` divided by ``total``.

    ``total`` is converted to float before dividing, so the result is a list
    of nine floats. A ``total`` of zero raises ZeroDivisionError.
    """
    return [listofsums[pos] / float(total) for pos in range(9)]

def Classifier(TrainSet):
    """Build the per-attribute decision thresholds from the training records.

    Each record is indexed as ``(id, label, attr1, ..., attr9)``. Records
    labelled 'b' are accumulated as benign; every other label is accumulated
    as malignant. The per-attribute mean of each group is computed, and the
    returned list holds, for each of the nine attributes, the midpoint
    between the benign mean and the malignant mean.

    Raises ZeroDivisionError (from makeAverages) when either group is empty.
    """
    totals_b, totals_m = [0] * 9, [0] * 9
    seen_b = seen_m = 0

    for record in TrainSet:
        label = record[1]
        attributes = record[2:]          # everything after id and label
        if label != 'b':
            totals_m = sumLists(totals_m, attributes)
            seen_m += 1
            continue
        totals_b = sumLists(totals_b, attributes)
        seen_b += 1

    mean_b = makeAverages(totals_b, seen_b)
    mean_m = makeAverages(totals_m, seen_m)

    # Threshold = average of the two group means, attribute by attribute.
    combined = sumLists(mean_b, mean_m)
    return makeAverages(combined, 2)
###########################################
#           测试分类器
###########################################
def Test(TestSet,classifier):
    """Score every record against the thresholds and collect the vote counts.

    For each record, each of the nine attributes (stored from index 2
    onwards) casts one vote: malignant when it is strictly greater than the
    matching threshold in ``classifier``, benign otherwise.

    Returns a list of tuples
    ``(id, benign_votes, malignant_votes, actual_label)``.
    """
    outcomes = []
    for record in TestSet:
        # Attribute values start at offset 2, after the id and the label.
        malignant_votes = sum(
            1 for pos in range(9) if record[pos + 2] > classifier[pos])
        benign_votes = 9 - malignant_votes
        outcomes.append((record[0], benign_votes, malignant_votes, record[1]))
    return outcomes
###########################################
#           格式化输出测试结果
###########################################
def ShowResult(Result):
    """Print how many records were tested and how many were misclassified.

    Each entry is indexed as ``(id, benign_votes, malignant_votes, label)``.
    The prediction is benign when there are strictly more benign votes than
    malignant votes, and malignant otherwise (including ties). An entry is
    counted as wrong when its label is the opposite of the prediction
    ('m' for a benign prediction, 'b' for a malignant one).
    """
    seen = 0
    mistakes = 0

    for seen, entry in enumerate(Result, 1):
        predicted_benign = entry[1] > entry[2]
        # The label that would contradict the prediction.
        contradicting_label = 'm' if predicted_benign else 'b'
        if entry[3] == contradicting_label:
            mistakes += 1
    print("%d patients,there were %d wrong" %(seen,mistakes))
###########################################
#           主函数
###########################################  
def main(TrainFileName="C:\\Python36\\code\\RuXian\\fullTrainData.txt",
         TestFileName="C:\\Python36\\code\\RuXian\\fullTestData.txt"):
    """Train the classifier on one file, test it on another, and report.

    Args:
        TrainFileName: path of the training data file. Defaults to the
            original hard-coded location, so ``main()`` behaves as before.
        TestFileName: path of the test data file. Defaults to the original
            hard-coded location.

    Progress messages and the final error count are printed to stdout.
    """
    print("Reading in train data ...")
    TrainSet = ReadSet(TrainFileName)
    print("Read TrainSet Done!")

    print("Begin Training...")
    classifier = Classifier(TrainSet)
    print("Train Classifier Done!")

    print("Reading in test data ...")
    TestSet = ReadSet(TestFileName)
    print("Read TestSet Done!")

    print("Begin Testing...")
    Result = Test(TestSet,classifier)
    print("Test  Done!")

    ShowResult(Result)
    print ("program finished.\n")
参考:《Python入门经典学习书》

你可能感兴趣的:(Python)