import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the raw data set; 'Data.csv' is resolved against the current working directory.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')
在Variable explorer中可以点击查看数据集内容:
设置自变量矩阵:
# Feature matrix: keep every row and drop only the final column.
feature_columns = dataset.iloc[:, :-1]
X = feature_columns.values
表示把数据集的所有行和除了最后一列外的所有列赋值给自变量矩阵X。
# Target vector: every row of the last column. Selecting it with -1 (instead of
# the hard-coded index 3) mirrors the `:-1` slice used for X, so the features
# and the target stay disjoint whatever the number of columns in the CSV.
y = dataset.iloc[:, -1].values
表示把数据集的所有行和最后一列(本数据集中最后一列的索引为3,索引从0开始计数,即第4列)赋值给因变量向量y。
缺失数据(Missing Data):第4行的Salary缺失以及第6行的Age缺失。
用相应列的平均数处理缺失数据:
# Fill the missing entries in columns 1 and 2 of X with the mean of each column.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer` is its replacement. It always imputes
# column-wise (the old `axis=0` behaviour) and takes `np.nan` as the
# missing-value marker instead of the string 'NaN'.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Encode the categorical column 0 of X: first as integer codes, then as one-hot
# (dummy) columns placed in front of the remaining columns.
# The `categorical_features` argument of OneHotEncoder was removed in
# scikit-learn 0.22, so the column selection is delegated to ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    [('onehot', onehotencoder, [0])],
    remainder='passthrough',  # keep the other columns, after the dummy columns
    sparse_threshold=0,       # always produce a dense result, like the old .toarray()
)
# The pass-through columns come from an object array, so cast explicitly to get
# the float matrix the previous OneHotEncoder call returned.
# NOTE: ColumnTransformer fits a clone; the fitted encoder is
# column_transformer.named_transformers_['onehot'], not `onehotencoder` itself.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)
与dataset对比:
y:
# Replace the labels in y with integer codes; the fitted encoder is kept in
# `labelencoder_y` so the codes can be mapped back to the original labels.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)
与dataset对比:
将数据分成训练集和测试集
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only, then apply
# that same fitted scaler to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
# Load the raw data set; 'Data.csv' is resolved against the R working directory.
dataset <- read.csv(file = 'Data.csv')
在Environment中可以点击查看数据集内容:
缺失数据(Missing Data):第4行的Salary缺失以及第6行的Age缺失。
用相应列的平均数处理缺失数据:
# Replace each missing Age / Salary with the mean of that column's observed
# values (na.rm = TRUE leaves the NA entries out of the mean).
dataset$Age <- replace(dataset$Age,
                       is.na(dataset$Age),
                       mean(dataset$Age, na.rm = TRUE))
dataset$Salary <- replace(dataset$Salary,
                          is.na(dataset$Salary),
                          mean(dataset$Salary, na.rm = TRUE))
# Turn the two text columns into factors with numeric labels.
# Country: France -> 1, Spain -> 2, Germany -> 3.
dataset$Country <- factor(
  x = dataset$Country,
  levels = c('France', 'Spain', 'Germany'),
  labels = c(1, 2, 3)
)
# Purchased: No -> 0, Yes -> 1.
dataset$Purchased <- factor(
  x = dataset$Purchased,
  levels = c('No', 'Yes'),
  labels = c(0, 1)
)
# caTools provides sample.split(), used for the train/test split.
# Install it only when it is missing (so re-running the script does not
# reinstall it every time), then attach it so sample.split() is available
# without ticking the package by hand.
if (!requireNamespace('caTools', quietly = TRUE)) {
  install.packages('caTools')
}
library(caTools)
在Packages面板中勾选刚安装好的caTools包(等价于在脚本中执行library(caTools)):
分类数据编码完成后,将数据分成训练集和测试集:
# Fix the RNG seed so the random split is reproducible.
set.seed(123)
# sample.split() yields a logical vector with one entry per row:
# TRUE rows go to the training set, FALSE rows to the test set.
split <- sample.split(dataset$Purchased, SplitRatio = 0.8)
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)
# Standardise columns 2 and 3 (the numeric columns; the factor columns cannot
# be scaled). The centre and scale are estimated on the training set only and
# then reused for the test set, so both sets go through the same transformation
# and no test-set statistics are used.
scaled_training <- scale(training_set[, 2:3])
test_set[, 2:3] <- scale(test_set[, 2:3],
                         center = attr(scaled_training, 'scaled:center'),
                         scale = attr(scaled_training, 'scaled:scale'))
training_set[, 2:3] <- scaled_training