Ways to load data:
np.genfromtxt
pd.read_csv
sklearn built-in datasets: load_iris(), load_digits(), load_boston() (the last was removed in scikit-learn 1.2)
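A minimal loading sketch; the file names here are placeholders:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_digits

# text/CSV files (hypothetical file names)
arr = np.genfromtxt('data.txt', delimiter=',')
df = pd.read_csv('data.csv')

# scikit-learn built-in datasets
iris = load_iris()       # iris.data, iris.target
digits = load_digits()   # digits.data, digits.target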
Show the first 5 rows: data.head()
Show the full data as a DataFrame: pd.DataFrame(data)
View data info: data.info()
Summary statistics: data.describe()
Data shape: data.shape
Column dtypes: data.dtypes
Label distribution: data.Target.value_counts()
Bar chart of label counts: data.Target.value_counts().plot(kind="bar")
Missing-value bar chart: msno.bar(data) (requires the missingno library)
For decision trees, the number of leaf classes: data.species.unique()
Visualize the data distribution: sns.pairplot(data, hue="Target")
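A quick exploration sketch using the built-in iris data (the Target column is added here for illustration):
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris

iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['Target'] = iris.target

print(data.head())                    # first 5 rows
data.info()                           # dtypes and non-null counts
print(data.describe())                # summary statistics
print(data['Target'].value_counts())  # label distribution
sns.pairplot(data, hue='Target')      # pairwise feature distributions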
A common step is to convert zero values in a feature to NaN, then impute the NaNs.
Replace 0 with NaN: data[column] = data[column].replace(0, np.nan)
Inspect the distribution of missing values:
import missingno as msno
p = msno.bar(data)
Set a threshold and drop columns with too many missing values outright:
thresh_count = data.shape[0]*0.8
# a column is dropped if more than 20% of its values are missing
data = data.dropna(thresh=thresh_count, axis=1)
p = msno.bar(data)
Impute the missing values:
# import the imputer
from sklearn.impute import SimpleImputer
# for numeric variables, fill missing values with the column mean
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# impute (SimpleImputer expects 2-D input, hence the double brackets)
data[column] = imr.fit_transform(data[[column]]).ravel()
p = msno.bar(data)
# after filling each feature's missing values with its mean, the data contains no NaNs
Or use pandas directly:
# fill a column's missing values with that column's median
data[column] = data[column].fillna(data[column].median())
print(data.describe())
Remember to separate the features from the labels.
A correlation heatmap shows the relationship between each pair of features; each cell is the correlation coefficient between two variables.
The coefficient ranges from -1 to 1: greater than 0 means the two variables are positively correlated, less than 0 negatively correlated, and 0 uncorrelated.
plt.figure(figsize=(15,15))
p = sns.heatmap(df.corr(),annot=True,square=True)
# or with additional styling parameters
p = sns.heatmap(data.corr(), annot=True, annot_kws = {'fontsize' : 15 }, square=True)
Drop feature columns with NumPy:
x_train = np.delete(x_train,[0,1,2],axis=1)
x_test = np.delete(x_test,[0,1,2],axis=1)
Or with the pandas default:
data = data.drop([column], axis=1)
Convert the feature columns to numeric values so a model can be built. In pandas, use iloc[] for positional indexing.
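For example, a minimal iloc sketch, assuming the label sits in the last column:
# all rows, every column except the last -> features
x_data = data.iloc[:, :-1].values
# all rows, last column only -> labels
y_data = data.iloc[:, -1].values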
# convert string-typed features to numeric codes
from sklearn.preprocessing import LabelEncoder
# encode one feature column as integers
labelencoder1 = LabelEncoder()
x_train[:,1] = labelencoder1.fit_transform(x_train[:,1])
x_test[:,1] = labelencoder1.transform(x_test[:,1])
Inspect the learned classes: labelencoder1.classes_
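The encoder can also map the integer codes back to the original strings:
# decode integer codes back to the original category strings
labelencoder1.inverse_transform([0, 1])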
Or do the conversion with pandas:
print(titanic["Sex"].unique())
# male -> 0, female -> 1
# loc[] indexes pandas data by label: rows before the comma, columns after
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
print(titanic["Embarked"].unique())
# fill missing values
titanic["Embarked"] = titanic["Embarked"].fillna('S')
# map the categories to integers
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
Or:
df['diagnosis'] = df['diagnosis'].map({'M':1,'B':0})
Convert the data to float32, turning the quoted string values into numbers:
x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)
Or use get_dummies for one-hot encoding:
# categorical string columns to encode
conver_cols = ['素材类型', '广告类型', '合作方式', '广告尺寸', '广告卖点']
# one-hot encode them with get_dummies
dummy_df = pd.get_dummies(df[conver_cols])
# drop the original feature columns
df2 = df.drop(conver_cols, axis=1)
# with axis=1, concat aligns on rows and joins the two frames column by column
df2 = pd.concat([df2, dummy_df], axis=1)
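A toy illustration of what get_dummies produces (the values are made up):
import pandas as pd

s = pd.Series(['video', 'banner', 'video'])
# one indicator column per category (0/1, or bool in newer pandas)
print(pd.get_dummies(s))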
Rescale features whose values span a large range so all features sit on a comparable scale. StandardScaler standardizes each feature to zero mean and unit variance; it does not strictly bound values to [-1, 1], though most standardized values end up in a small range around 0.
# standardize the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
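If a fixed [-1, 1] range is actually what you want, MinMaxScaler does that directly:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler(feature_range=(-1, 1))
x_train = mm.fit_transform(x_train)
x_test = mm.transform(x_test)  # reuse the training-set min/max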
stratify ensures the label proportions after the split match the proportions before it.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, stratify=y_data)
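A quick check that stratification worked; the proportions should match:
# label proportions before and after the split
print(pd.Series(y_data).value_counts(normalize=True))
print(pd.Series(y_train).value_counts(normalize=True))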
If a model requires 2-D input, a 1-D array sometimes needs an extra dimension:
x_train = x_train[:, np.newaxis]
x_test = x_test[:,np.newaxis]
Select the features:
# chosen feature columns
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
x_data = titanic[predictors]
y_data = titanic["Survived"]
# shuffle the data (assumes data and target are NumPy arrays)
import random
data_size = data.shape[0]
index = [i for i in range(data_size)]
random.shuffle(index)
data = data[index]
target = target[index]
# split the dataset
test_size = 40
x_train = data[test_size:]
x_test = data[:test_size]
y_train = target[test_size:]
y_test = target[:test_size]
# logistic regression
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# build the model
LR = LogisticRegression()
# compute cross-validation scores
scores = model_selection.cross_val_score(LR, x_data, y_data, cv=3)
# average them
print(scores.mean())
# import the KNN algorithm
from sklearn.neighbors import KNeighborsClassifier
# test-set accuracy for each k
test_scores = []
# training-set accuracy for each k
train_scores = []
# try k values from 1 to 29
k = 30
for i in range(1, k):
    knn = KNeighborsClassifier(i)
    knn.fit(x_train, y_train)
    # record test-set accuracy
    test_scores.append(knn.score(x_test, y_test))
    # record training-set accuracy
    train_scores.append(knn.score(x_train, y_train))
Or:
# KNN
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(21)
# cross-validation
scores = model_selection.cross_val_score(knn, x_data, y_data, cv=3)
# average
scores.mean()
hidden_layer_sizes: the sizes of the hidden layers
max_iter: the maximum number of training iterations
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,50), max_iter=500)
mlp.fit(x_train, y_train)
Or:
# neural network
from sklearn.neural_network import MLPClassifier
# build the model
mlp = MLPClassifier(hidden_layer_sizes=(20,10), max_iter=1000)
# compute cross-validation scores
scores = model_selection.cross_val_score(mlp, x_data, y_data, cv=3)
# average
print(scores.mean())
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
Tuning an overfitted decision tree (training accuracy is very high but test accuracy is low):
# max_depth: maximum depth of the tree
# min_samples_split: minimum number of samples required to split an internal node
# min_samples_leaf: minimum number of samples required at a leaf node
param_grid = {'max_depth': [30,40,50,60,70],
              'min_samples_split': [2,3,4,5,6],
              'min_samples_leaf': [1,2,3,4]}
# grid search automatically tries every combination: 5*5*4
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3)
model.fit(x_train, y_train)
print(model.best_estimator_)
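After fitting, the search also exposes the best parameters and score:
print(model.best_params_)  # best parameter combination
print(model.best_score_)   # its mean cross-validated accuracy
# GridSearchCV refits the best model by default, so it can score directly
print(model.score(x_test, y_test))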
Or:
# decision tree
from sklearn import tree
# decision tree model
dtree = tree.DecisionTreeClassifier(max_depth=5, min_samples_split=4)
# cross-validation
score = model_selection.cross_val_score(dtree, x_data, y_data, cv=3)
score.mean()
# random forest
from sklearn.ensemble import RandomForestClassifier
RF1 = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2)
# cross-validation
scores = model_selection.cross_val_score(RF1, x_data, y_data, cv=3)
scores.mean()
RF2 = RandomForestClassifier(n_estimators=100, min_samples_split=4)
# cross-validation
scores = model_selection.cross_val_score(RF2, x_data, y_data, cv=3)
# average
print(scores.mean())
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# build the model (bagging_clf, defined earlier, serves as the base estimator)
adaboost = AdaBoostClassifier(bagging_clf, n_estimators=10)
# cross-validation
scores = model_selection.cross_val_score(adaboost, x_data, y_data, cv=3)
# average
print(scores.mean())
# Stacking
from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingClassifier
sclf = StackingClassifier(classifiers=[bagging_clf, mlp, LR],
                          meta_classifier=LogisticRegression())
sclf2 = VotingClassifier([('adaboost',adaboost), ('mlp',mlp), ('LR',LR), ('knn',knn), ('dtree',dtree)])
# compute cross-validation scores
scores = model_selection.cross_val_score(sclf2, x_data, y_data, cv=3)
# average
print(scores.mean())
This approach builds a word vocabulary; each document becomes a feature vector whose elements count how many times each word appears in the text.
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
cv = CountVectorizer()
cv_data = cv.fit_transform(x_train)
mul_nb = MultinomialNB()
scores = model_selection.cross_val_score(mul_nb, cv_data, y_train, cv=3, scoring='accuracy')
print("Accuracy: %0.3f" % (scores.mean()))
TF-IDF is a statistic that measures how important a word is to a document in a corpus. Intuitively, it favors words that occur often in the current document relative to how often they occur across the whole corpus. This normalization keeps very common words, which say little about any particular document, from dominating an instance's features.
from sklearn.feature_extraction.text import TfidfVectorizer
# create the transformer
vectorizer = TfidfVectorizer()
# tokenize and build the vocabulary
tfidf_train = vectorizer.fit_transform(x_train)
scores = model_selection.cross_val_score(mul_nb, tfidf_train, y_train, cv=3, scoring='accuracy')
print("Accuracy: %0.3f" % (scores.mean()))
Optimize with stop words:
def get_stop_words():
    result = set()
    for line in open('stopwords_en.txt', 'r').readlines():
        result.add(line.strip())
    return result
# load the stop words
stop_words = get_stop_words()
# create the transformer
vectorizer = TfidfVectorizer(stop_words=stop_words)
mul_nb = MultinomialNB(alpha=0.01)
# tokenize and build the vocabulary
tfidf_train = vectorizer.fit_transform(x_train)
scores = model_selection.cross_val_score(mul_nb, tfidf_train, y_train, cv=3, scoring='accuracy')
print("Accuracy: %0.3f" % (scores.mean()))
Used when splitting the dataset:
# split the dataset
tfidf_data = vectorizer.fit_transform(news.data)
x_train, x_test, y_train, y_test = train_test_split(tfidf_data, news.target)
mul_nb.fit(x_train, y_train)
print(mul_nb.score(x_train, y_train))
print(mul_nb.score(x_test, y_test))
Note that fitting the vectorizer on the full corpus before splitting may lead to overfitting.
# elbow method
from sklearn.cluster import KMeans
loss = []
for i in range(2, 10):
    model = KMeans(n_clusters=i).fit(X)
    loss.append(model.inertia_)
plt.plot(range(2,10), loss)
plt.xlabel('k')
plt.ylabel('loss')
plt.show()
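Once the curve shows an elbow, fit the final model at that k (k=3 here is only a placeholder):
model = KMeans(n_clusters=3).fit(X)
labels = model.labels_            # cluster assignment for each sample
centers = model.cluster_centers_  # cluster centroids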
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
classifiers = [
    KNeighborsClassifier(3),
    LogisticRegression(),
    MLPClassifier(hidden_layer_sizes=(20,50), max_iter=10000),
    DecisionTreeClassifier(),
    RandomForestClassifier(max_depth=9, min_samples_split=3),
    AdaBoostClassifier(),
    BaggingClassifier(),
]
log = []
for clf in classifiers:
    clf.fit(x_train, y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    test_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))
    log.append([name, acc*100])
    print("="*30)
log = pd.DataFrame(log)
log.rename(columns={0: 'Classifier', 1: 'Accuracy'}, inplace=True)
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
plt.xlabel('Accuracy %')
plt.title('Classifier Accuracy')
plt.show()
plt.title('KNN Varying number of neighbors')
plt.plot(range(1,k),test_scores,label="Test")
plt.plot(range(1,k),train_scores,label="Train")
plt.legend()
plt.xticks(range(1,k))
plt.xlabel('k')
plt.ylabel('accuracy')
plt.show()
# pick the k with the best test accuracy as the model parameter
k = np.argmax(test_scores)+1
knn = KNeighborsClassifier(k)
knn.fit(x_train,y_train)
knn.score(x_test,y_test)
Scatter plot of the data with the model's fitted regression line (for regression problems), plus a title and axis labels:
plt.scatter(x_train, y_train, color='b')
plt.plot(x_train, model.predict(x_train), color='r', linewidth=2)
plt.title('Age Vs Quality (Training set)')
plt.xlabel('Age')
plt.ylabel('Quality')
plt.show()
A confusion matrix summarizes a classification model's predictions by tabulating the records in the dataset along two axes: the true class and the class the model predicted.
from sklearn.metrics import confusion_matrix
# sklearn expects the true labels first, then the predictions
confusion = confusion_matrix(y_test, y_prediction)
# wrap in a DataFrame
df_cm = pd.DataFrame(confusion)
# display as a heatmap
sns.heatmap(df_cm, annot=True)
Report precision / recall / F1 in table form:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))
ElasticNet combines the strengths of ridge regression and LASSO: rather than choosing between the absolute-value and squared penalties, it weights them with coefficients α and 1-α.
from sklearn.linear_model import ElasticNetCV
EN = ElasticNetCV()
EN.fit(x_train, y_train)
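The CV variant selects its own regularization; the chosen values can be inspected afterwards:
print(EN.alpha_)     # regularization strength selected by cross-validation
print(EN.l1_ratio_)  # mixing weight between the L1 and L2 penalties
print(EN.score(x_test, y_test))  # R^2 on the test set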