1、读取CSV文件
# Load the CSV file into a DataFrame.
# A raw string literal cannot end with a single backslash (r'.\' is a
# SyntaxError), and read_csv needs a file path rather than a directory.
# 'data.csv' is a placeholder -- replace it with the real file name.
data = pd.read_csv(r'.\data.csv')
2、看数据大致情况
# Print a summary of the DataFrame (columns, non-null counts, dtypes) to get
# a first impression of the data and spot columns with missing values.
data.info()
3、当表很大的时候想看表长什么样子
# Preview the top of a large table. `n` is not defined in this snippet and
# must be set to the desired number of rows beforehand.
data.head(n)  # show the first n rows
4、删除表的若干列
# Remove the columns named 'aaa', 'bbb' and 'ccc'.
# The method is `drop` (the original `trop` raises AttributeError).
# inplace=True modifies `data` itself instead of returning a new DataFrame.
data.drop(columns=['aaa', 'bbb', 'ccc'], inplace=True)
5、用均值填补缺失值
# Impute missing values in the 'age' column with the mean of that column.
mean_age = data['age'].mean()
data['age'] = data['age'].fillna(mean_age)
6、删掉有缺失值的行
# Drop every row that still contains at least one missing value.
# The method is `dropna` (the original `drpona` raises AttributeError).
data = data.dropna()  # default axis=0 removes rows
7、查看某一列中有哪些不同的类别
# List the distinct values that occur in column 'ddd'.
data['ddd'].unique()
8、将上面的类别['s','q','k']转化为[0,1,2]供分类
# Encode the 'Embarked' categories as integers: each value is replaced by its
# position in the list of distinct values (order of first appearance).
labels = data['Embarked'].unique().tolist()
data['Embarked'] = data['Embarked'].apply(labels.index)
9、把性别转换为0,1
# Binary-encode 'Sex': 1 where the value is 'male', 0 for anything else.
is_male = data['Sex'] == 'male'
data['Sex'] = is_male.astype('int')
10、取出除了某一列外的数据
# Select all rows and every column except the one named 'kkk'.
keep_columns = data.columns != 'kkk'  # boolean mask over the column labels
x = data.iloc[:, keep_columns]
11、将乱序的索引重置为从0开始的顺序索引
# Replace the shuffled index with consecutive integers 0 .. number_of_rows - 1.
xtrain.index = range(len(xtrain))
12、训练并交叉验证
# Evaluate a decision tree with 10-fold cross-validation and keep the mean
# of the per-fold scores. random_state is fixed so the run is reproducible.
clf = DecisionTreeClassifier(random_state=25)
fold_scores = cross_val_score(clf, x, y, cv=10)
score = fold_scores.mean()
13、网格搜索
import numpy as np
# Candidate values for min_impurity_decrease: 20 evenly spaced points in [0, 0.5].
gini_thresholds = np.linspace(0, 0.5, 20)

# Hyper-parameter grid. Every combination is tried: 2 * 2 * 9 * 10 * 20 = 7200
# candidates, each fitted 10 times by the 10-fold cross-validation below.
parameters = {
    'splitter': ('best', 'random'),
    'criterion': ("gini", "entropy"),
    "max_depth": list(range(1, 10)),
    'min_samples_leaf': list(range(1, 50, 5)),
    'min_impurity_decrease': list(gini_thresholds),
}

# Exhaustive search over the grid with a reproducible decision tree.
clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10)
GS.fit(Xtrain, Ytrain)

# Best parameter combination and its mean cross-validated score. These are
# bare expressions, so they are only displayed in an interactive session.
GS.best_params_
GS.best_score_