MNL(使用自己的数据集)

1. 导入包

import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

2. 导入自己的数据

data_wide = pd.read_csv("./data/mode_wide.csv", index_col=0) # index_col=0: use the first column of the CSV as the row index
data_wide
choice cost.car cost.carpool cost.bus cost.rail time.car time.carpool time.bus time.rail
1 car 1.507010 2.335612 1.800512 2.358920 18.503200 26.338233 20.867794 30.033469
2 rail 6.056998 2.896919 2.237128 1.855450 31.311107 34.256956 67.181889 60.293126
3 car 5.794677 2.137454 2.576385 2.747479 22.547429 23.255171 63.309057 49.171643
4 car 1.869144 2.572427 1.903518 2.268276 26.090282 29.896023 19.752704 13.472675
5 car 2.498952 1.722010 2.686000 2.973866 4.699140 12.414084 43.092039 39.743252
... ... ... ... ... ... ... ... ... ...
449 rail 6.990901 0.515137 2.066044 2.171174 48.022792 44.501577 27.271918 18.966319
450 car 4.591647 2.891148 1.900379 1.794407 29.444192 33.727087 66.117345 39.842459
451 car 3.236237 1.206815 1.754674 2.023671 16.349017 18.975074 23.387729 43.298276
452 bus 6.932740 1.171861 2.461495 2.612489 65.420641 60.481668 52.404315 48.370662
453 carpool 6.531509 1.408171 2.214791 1.856338 59.566073 55.141406 67.815635 73.447286

453 rows × 9 columns

2. 处理数据

y= 1(选car);

y = 2 (carpool);

y = 3 (rail);

y = 4 (bus);

def choice_to_y(choice):
    """Encode a travel-mode label as the integer class used for the target y.

    Mapping: 'car' -> 1, 'carpool' -> 2, 'rail' -> 3, 'bus' -> 4.

    Args:
        choice: The travel-mode label to encode.

    Returns:
        The integer class code (1-4) for the label.

    Raises:
        ValueError: If ``choice`` is not one of the four known labels.
            Unknown values (typos, missing values, different casing) are
            rejected instead of being silently counted as 'bus'.
    """
    codes = {'car': 1, 'carpool': 2, 'rail': 3, 'bus': 4}
    try:
        return codes[choice]
    except (KeyError, TypeError):
        # KeyError: unknown label; TypeError: unhashable value such as a list.
        raise ValueError(f"unknown travel mode: {choice!r}") from None

# Add the integer-encoded target column 'y', derived from the 'choice' labels.
data_wide['y'] = data_wide['choice'].map(choice_to_y)
data_wide
choice cost.car cost.carpool cost.bus cost.rail time.car time.carpool time.bus time.rail y
1 car 1.507010 2.335612 1.800512 2.358920 18.503200 26.338233 20.867794 30.033469 1
2 rail 6.056998 2.896919 2.237128 1.855450 31.311107 34.256956 67.181889 60.293126 3
3 car 5.794677 2.137454 2.576385 2.747479 22.547429 23.255171 63.309057 49.171643 1
4 car 1.869144 2.572427 1.903518 2.268276 26.090282 29.896023 19.752704 13.472675 1
5 car 2.498952 1.722010 2.686000 2.973866 4.699140 12.414084 43.092039 39.743252 1
... ... ... ... ... ... ... ... ... ... ...
449 rail 6.990901 0.515137 2.066044 2.171174 48.022792 44.501577 27.271918 18.966319 3
450 car 4.591647 2.891148 1.900379 1.794407 29.444192 33.727087 66.117345 39.842459 1
451 car 3.236237 1.206815 1.754674 2.023671 16.349017 18.975074 23.387729 43.298276 1
452 bus 6.932740 1.171861 2.461495 2.612489 65.420641 60.481668 52.404315 48.370662 4
453 carpool 6.531509 1.408171 2.214791 1.856338 59.566073 55.141406 67.815635 73.447286 2

453 rows × 10 columns

3. 确定自变量X和因变量y

data_wide.columns
Index(['choice', 'cost.car', 'cost.carpool', 'cost.bus', 'cost.rail',
       'time.car', 'time.carpool', 'time.bus', 'time.rail', 'y'],
      dtype='object')
# Features: the cost and travel-time columns of all four modes.
X = data_wide[['cost.car', 'cost.carpool', 'cost.bus', 'cost.rail','time.car', 'time.carpool', 'time.bus', 'time.rail']]
# Target: the integer-encoded choice column.
y = data_wide['y']

4. 配置Logit模型并评估

# Multinomial logistic regression using the lbfgs solver.
# NOTE(review): max_iter is left at its default, and the later model.fit(X, y)
# call reports that lbfgs reached the iteration limit. Consider passing a
# larger max_iter or standardizing the features, as that warning suggests.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# Define the model evaluation procedure: n_splits is the K of K-fold, and
# n_repeats is the number of times the whole K-fold procedure is repeated.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate the model and collect the accuracy score of every fold.
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# Report the mean and standard deviation of the collected scores.
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))  
Mean Accuracy: 0.665 (0.061)

5. 拟合

# Fit the model on the full X and y.
model.fit(X, y)
D:\ANACONDA\lib\site-packages\sklearn\linear_model\_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,





LogisticRegression(multi_class='multinomial')

6. 设置一个新的数据,预测结果

# Generate a new observation: 8 uniform random values in [0, 1), one per feature.
# NOTE(review): no random seed is set, so these values differ on every run.
new_data = np.random.rand(8)
new_data
array([0.11880174, 0.16505872, 0.14297278, 0.50355392, 0.87629855,
       0.91189688, 0.57073101, 0.19178997])
# Predict
# Predict the class probabilities for the new observation; the wrapping list
# turns the single row into the 2-D input that predict_proba expects.
# NOTE(review): the model was fitted on a DataFrame but receives a bare array
# here, which is what triggers the "X does not have valid feature names"
# warning. Passing pd.DataFrame([new_data], columns=X.columns) would avoid it.
yhat = model.predict_proba([new_data])

# Print the predicted probabilities of the single input row.
print('Predicted Probabilities: %s' % yhat[0])
Predicted Probabilities: [0.3749058  0.20228137 0.20380141 0.21901142]


D:\ANACONDA\lib\site-packages\sklearn\base.py:451: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  "X does not have valid feature names, but"

至此,我们已经了解了使用自己的数据进行多项 Logit(MNL)回归的基本思路。

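上面的 UserWarning 并不是因为列名的格式不正确,而是因为模型是用带列名的 DataFrame(X)拟合的,预测时传入的却是不带列名的 NumPy 数组。把新数据包装成列名相同的 DataFrame 即可消除该警告,例如 model.predict_proba(pd.DataFrame([new_data], columns=X.columns))。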

你可能感兴趣的:(数学模型,python)