def getres1(row):
    """Count the negative integer-valued entries in *row*.

    In this survey dataset negative codes mark invalid/missing answers,
    so this is a per-sample "bad answer" count.
    """
    # isinstance with np.integer also catches numpy int dtypes
    # (np.int64 etc.); the original `type(x) == int` test silently
    # missed them, returning 0 for rows from int64 columns.
    # bool is a subclass of int but can never be < 0, so it is harmless.
    return sum(1 for x in row.values if isinstance(x, (int, np.integer)) and x < 0)
# Per-row count of negative ("invalid answer") entries.
# data[data.columns] is the whole frame, and the lambda wrapper was
# redundant, so apply getres1 directly.
data['neg1'] = data.apply(getres1, axis=1)
# Smooth the tail: cap counts above 20 at 20.
data.loc[data['neg1'] > 20, 'neg1'] = 20
缺失值也可以删去样本或者特征,不过在样本量比较少,特征不明确的情况下不建议直接删除特征。
可以直接用fillna补充
# Fill missing leisure_4 with numeric 0. The original used fullwidth
# quotes (“0”), which is a SyntaxError, and even the string "0" would
# break the numeric `< 0` comparison on the next line.
data['leisure_4'] = data['leisure_4'].fillna(0)
# Replace negative (invalid) codes with the column mode. Series.mode()
# returns a Series; take element 0 — assigning the Series itself to
# .loc would align on index and write NaN instead of the mode value.
data.loc[data['leisure_4'] < 0, 'leisure_4'] = data['leisure_4'].mode()[0]
# Negative family_income codes mean "no valid answer": replace them
# with the column mean.
family_income_mean = data['family_income'].mean()
data['family_income'] = data['family_income'].mask(
    data['family_income'] < 0, family_income_mean
)
# Negative f_work_14 codes are set to the constant 2.
data['f_work_14'] = data['f_work_14'].mask(data['f_work_14'] < 0, 2)
取众数和均值是可以互换的,需要根据实际情况选取。
可以分析特征关系进行进一步增加,可以补充的值如果在国民幸福度总结中,可以包括悠闲指数、满意指数、信任指数,一些综合的,被前人研究出的一些指标。
比如说所在省份的平均收入
# province mean 168+13=181
# Mean income per province, broadcast back to every row of that
# province via transform.
data['province_income_mean'] = (
    data.groupby('province')['income'].transform('mean').values
)
# Respondent's income relative to their province average.
data['income/province'] = data['income'] / data['province_income_mean']
删除缺失值较多的,或者过度偏态的特征
# One-hot encode the categorical feature set cat_fea.
enc = preprocessing.OneHotEncoder(categories='auto')
onehot_data = data[cat_fea].values
oh_data = enc.fit_transform(onehot_data).toarray()
oh_data.shape  # now one block of 0/1 indicator columns per sample
# Merge the one-hot columns with the remaining features.
# noc_fea: the features that were NOT one-hot encoded.
noc_train = data[:train_shape][noc_fea].values
noc_test = data[train_shape:][noc_fea].values
X_train_383 = np.column_stack([noc_train, X_train_oh])
X_test_383 = np.column_stack([noc_test, X_test_oh])
from sklearn.model_selection import train_test_split
# NOTE(review): X and y are not defined at this point in the notes —
# this line assumes a feature matrix X and target y built in an
# earlier cell. Default split is 75%/25%; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
一般直接和其他的模型连用,合成pipeline
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Self-contained pipeline example: scaler + classifier fitted as one unit.
X, y = make_classification(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # the scaler is fit on training data only
# fit() returns the fitted pipeline; its repr looks like:
#   Pipeline(steps=[('standardscaler', StandardScaler()),
#                   ('logisticregression', LogisticRegression())])
# (The original pasted that REPL output as live code, which raises
# NameError because Pipeline itself was never imported.)
pipe.score(X_test, y_test)
# Parse the survey date; errors='coerce' turns malformed dates into
# NaT instead of raising.
data['survey_time'] = pd.to_datetime(
    data['survey_time'], format='%Y-%m-%d', errors='coerce'
)
# Keep only the year so that age = survey year - birth year.
data['survey_time'] = data['survey_time'].dt.year
data['age'] = data['survey_time'] - data['birth']
# Bucket ages into 6 labelled groups: (0,17], (17,26], ... (63,100].
bins = [0, 17, 26, 34, 50, 63, 100]
data['age_bin'] = pd.cut(data['age'], bins, labels=[0, 1, 2, 3, 4, 5])
线性模型:
sklearn链接 | code |
---|---|
逻辑回归 | from sklearn.linear_model import LogisticRegression |
感知机 | from sklearn.linear_model import Perceptron |
随机梯度下降的线性支持向量机:一般实现需要先将数据标准化 | from sklearn.linear_model import SGDClassifier |
贝叶斯岭回归 | from sklearn.linear_model import BayesianRidge as br |
核函数岭回归;原理 | from sklearn.kernel_ridge import KernelRidge as kr |
岭回归 | from sklearn.linear_model import Ridge |
Lasso回归 | from sklearn.linear_model import Lasso |
线性回归 | from sklearn.linear_model import LinearRegression as lr |
结合lasso和岭回归的方法 | from sklearn.linear_model import ElasticNet as en |
其他基础模型
sklearn 链接 | code |
---|---|
线性支持向量机 | from sklearn.svm import LinearSVC |
核支持向量机 | from sklearn.svm import SVC |
K邻近 | from sklearn.neighbors import KNeighborsClassifier |
高斯朴素贝叶斯 | from sklearn.naive_bayes import GaussianNB |
决策树 | from sklearn.tree import DecisionTreeClassifier |
集成学习
sklearn链接 | code |
---|---|
随机森林分类 | from sklearn.ensemble import RandomForestClassifier |
随机森林回归 | from sklearn.ensemble import RandomForestRegressor as rfr |
Extra-Trees(Extremely randomized trees,极端随机树);泛化能力更强 | from sklearn.ensemble import ExtraTreesRegressor as etr |
梯度下降的boosting算法 | from sklearn.ensemble import GradientBoostingRegressor as gbr |
使用sklearn以外的boost集成方法需要额外安装对应的库
xgboost,lightgbm更快
import lightgbm as lgb
import xgboost as xgb
lightgbm调参