import pandas as pd

# Load the Titanic training set and take a first look.
# .shape is a property (e.g. (891, 12)); .describe() summarises numerics.
train_data = pd.read_csv("train.csv")
train_data.shape
train_data.describe()

# BUG (fixed): fillna() returns a new Series -- the original call on its
# own line discarded the result, so Age was never actually filled.
# Assign the filled column back.  .loc[:, "Age"] selects all rows of the
# Age column.
train_data.loc[:, "Age"] = train_data["Age"].fillna(value=train_data["Age"].mean())

# sklearn's Imputer fills missing values: axis=0 imputes column-wise,
# axis=1 row-wise.  NOTE(review): sklearn.preprocessing.Imputer was
# removed in scikit-learn 0.22 -- modern code should use
# sklearn.impute.SimpleImputer instead.
from sklearn.preprocessing import Imputer
help(Imputer)  # Imputer is a class
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# fit_transform = fit (compute the column statistics) + transform (fill
# the gaps).  Use fit() alone when only the statistics are needed.
age = imp.fit_transform(train_data[["Age"]].values)

train_data.info()  # Age should now be 891 non-null float64
# --- Common feature-engineering operations on numeric columns ---

# Log transform: compresses the dynamic range of skewed features.
# Series.apply maps the function over every element.
import numpy as np
log_age = train_data["Age"].apply(lambda x: np.log(x))

# Min-max scaling (normalisation): x_norm = (x - x_min) / (x_max - x_min)
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
fare_mm = mm_scaler.fit_transform(train_data[["Fare"]])

# Standardisation: x_std = (x - mean) / std
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
fare_sds = sds.fit_transform(train_data[["Fare"]])

# 1. Summary statistics.
max_age = train_data["Age"].max()
min_age = train_data["Age"].min()

# Quantiles: 0.25 is the first quartile (22.0 for Age in this dataset).
age_quarter_1 = train_data["Age"].quantile(0.25)
age_quarter_1
# --- Higher-order and interaction (cross) features ---
from sklearn.preprocessing import PolynomialFeatures
pnf = PolynomialFeatures(degree=2)  # polynomial degree; 2 is the default
# For two input columns (a, b) the degree-2 expansion yields six
# columns: [1, a, b, a^2, a*b, b^2].
age_pnf = pnf.fit_transform(train_data[["SibSp", "Parch"]])
age_pnf

# --- Discretisation (binning / bucketing): pd.cut and pd.qcut ---
# pd.cut: equal-WIDTH bins -- the value range is split into 5 intervals
# of equal length and each value falls into the interval containing it.
train_data.loc[:, "Fare_cut"] = pd.cut(train_data["Fare"], 5)
train_data.head()
train_data["Fare_cut"].unique()

# pd.qcut: equal-FREQUENCY bins -- interval edges are chosen so every
# bin holds (roughly) the same number of observations.
train_data.loc[:, "Fare_qcut"] = pd.qcut(train_data["Fare"], 5)
train_data["Fare_qcut"].unique()
# --- One-hot encoding ---
# Expands a categorical column into one 0/1 indicator column per
# category.  Trade-off: the feature matrix gets wider and sparser.
embarked_ohe = pd.get_dummies(train_data[['Embarked']])
embarked_ohe.head()

fareqcut_ohe = pd.get_dummies(train_data["Fare_qcut"])
fareqcut_ohe.head()

# --- Time/date feature engineering ---
# BUG (fixed): the original called pd.read_csv("") with an empty path,
# which raises immediately.  The car-sales file used later in this
# script is 'car_data.csv'.
car_sales = pd.read_csv("car_data.csv")
import pandas as pd

# Second walkthrough: reload the training set and inspect it.
# info() shows the missing values: Age 714/891 non-null,
# Cabin 204/891, Embarked 889/891.
df_train = pd.read_csv('train.csv')
df_train.head(10)
df_train.describe()
df_train.info()
# Raw Age column as an (891, 1) ndarray; missing entries show up as nan.
# (Double brackets keep it 2-D, which is the shape sklearn expects.)
df_train[['Age']].values
# Inspect the Imputer API: strategies are "mean" / "median" /
# "most_frequent"; axis=0 imputes column-wise, axis=1 row-wise.
# NOTE(review): this class was removed in scikit-learn 0.22 in favour
# of sklearn.impute.SimpleImputer.
from sklearn.preprocessing import Imputer
help(Imputer)
# Fill the missing ages with the column mean (29.699... for this data).
# fit_transform accepts a DataFrame directly; appending .values first
# gives the same result.
impu = Imputer(missing_values='NaN', strategy='mean', axis=0)
age = impu.fit_transform(df_train[['Age']])
age
import numpy as np

# Element-wise log transform via DataFrame.apply (the function receives
# each column).  np.log(nan) is nan, so missing ages stay missing.
log_age = df_train[['Age']].apply(lambda x: np.log(x))
log_age
# Double brackets select a one-column DataFrame, so each element becomes
# a row vector and the scaler returns a 2-D array of shape (n, 1).
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms.fit_transform(df_train[['Fare']])

from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
SS.fit_transform(df_train[['Fare']])

# NOTE(review): despite the variable name, this is the maximum *Fare*
# (512.3292), not an age -- kept as-is to preserve behaviour.
max_age = df_train['Fare'].max()
print(max_age)
min_age = df_train[['Age']].min()
print(min_age)

# Derived feature: family size = siblings/spouses + parents/children
# + the passenger themselves.
df_train.loc[:, 'family_size'] = df_train['SibSp'] + df_train['Parch'] + 1
df_train.head(10)
df_train['family_size']

from sklearn.preprocessing import PolynomialFeatures
PnF = PolynomialFeatures()
# NOTE(review): the second positional argument of fit_transform is the
# target y and is ignored here -- only SibSp is expanded, producing
# [1, a, a^2].  To cross SibSp with Parch, pass both as columns of X
# (as the first walkthrough does).
Poly_fit = PnF.fit_transform(df_train[['SibSp']], df_train[['Parch']])
Poly_fit
# cut chooses bins of equal WIDTH over the value range; qcut chooses
# bin edges so each bin holds an equal FREQUENCY of observations.
cutdata = pd.qcut(df_train['Age'], 8)
cutdata
# pd.cut expects a 1-D array-like.
df_train.loc[:, 'fare_cut'] = pd.cut(df_train['Fare'], 5)
df_train.head(30)
df_train.info()

# Embarked has three categories: C, Q, S.
dfg = df_train.groupby(df_train['Embarked'])
dfg.describe()

# One-hot encode the port of embarkation.
embark_one_hot = pd.get_dummies(df_train['Embarked'])
embark_one_hot

# --- Date/time feature extraction ---
car_time = pd.read_csv('car_data.csv')
car_time.head(10)
# Parse the object-typed date strings (e.g. "2012-12-31") into
# datetime64.  BUG (fixed): the original passed format="", which is not
# a valid format string; the data is ISO year-month-day.
car_time.loc[:, "date"] = pd.to_datetime(car_time["date_t"], format="%Y-%m-%d")
car_time.head()
car_time.loc[:, "month"] = car_time["date"].dt.month
car_time.loc[:, "dom"] = car_time["date"].dt.day  # day of month
car_time.head()
# dt.dayofweek: Monday=0 ... Saturday=5, Sunday=6.
car_time.loc[:, "dow"] = car_time["date"].dt.dayofweek
# BUG (fixed): the original flagged dow in (6, 1) as weekend, i.e.
# Sunday and *Tuesday* (2013-01-01, a Tuesday, was marked weekend=1).
# The weekend is Saturday (5) and Sunday (6).
car_time.loc[:, "weekend"] = car_time["dow"].apply(lambda x: 1 if x >= 5 else 0)
car_time.head(10)
# --- Special type: text features ---
# 1. Bag-of-words model: CountVectorizer counts how often each
#    vocabulary term occurs in each document.
from sklearn.feature_extraction.text import CountVectorizer
verctorize = CountVectorizer()  # the constructor needs no arguments here
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?'
]
X = verctorize.fit_transform(corpus)
verctorize.get_feature_names()
# Rows = documents, columns = vocabulary terms in get_feature_names()
# order; each cell is the term's count in that document (e.g. a 1 in
# the 'and' column means 'and' occurred once in that sentence).
X.toarray()

# ngram_range=(1, 3): also count 2- and 3-word phrases, not just
# single terms.
vec = CountVectorizer(ngram_range=(1, 3))
X_ngram = vec.fit_transform(corpus)
X_ngram.toarray()

# 2. TF-IDF: a weighted variant of the bag of words -- each cell
#    reflects the term's weight within its document, down-weighting
#    terms common to many documents.
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer()
tfid_X = tfid.fit_transform(corpus)
tfid.get_feature_names()
tfid_X.toarray()
tfid.get_feature_names()
df_train.head()
# Combined (conditional) feature: a passenger travels alone when they
# have no siblings/spouse AND no parents/children aboard.
df_train.loc[:, "alone"] = (df_train['SibSp'] == 0) & (df_train['Parch'] == 0)
df_train.head()
# --- Feature selection ---
# 1. Filter method: score each feature against the target and keep the
#    k best (here the 2 iris features most associated with the class).
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
iris = load_iris()
X, y = iris.data, iris.target
X
X_new = SelectKBest(k=2).fit_transform(X, y)
X_new.shape  # (150, 2)
X_new

# 2. Wrapper method: RFE recursively eliminates the weakest features,
#    using a random forest to rank feature importance.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rfe = RFE(estimator=rf, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)
X_rfe.shape  # (150, 2)
X_rfe[:5, :]  # the two surviving columns

# 3. Embedded method: fit an L1-regularised linear SVM and keep the
#    features with non-zero coefficients.  (The original fitted the
#    identical model twice on consecutive lines; the duplicate fit was
#    redundant and has been removed.)
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_embed = model.transform(X)
X_embed.shape  # (150, 3)