import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.linear_model
from sklearn.ensemble import RandomForestRegressor
# train_test_split/cross_val_score live in model_selection; the old
# sklearn.cross_validation module has been removed
from sklearn.model_selection import train_test_split, cross_val_score
# classifier training/prediction helpers defined in test.py (reproduced below)
from test import do_training, do_predicting
test_classifiers = ['XGBOOST']
RATIO = 0.8
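# Pipeline: read train.csv -> impute missing ages with a RandomForestRegressor
# -> binarize Cabin -> one-hot encode categoricals -> scale Age/Fare -> train
# the classifiers listed in test_classifiers on the first RATIO of the rows ->
# predict test.csv and write result.csv.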
def show_data_in_figure(data):
fig = plt.figure()
fig.set(alpha=0.2)
plt.subplot2grid((2,3),(0,0))
data['Survived'].value_counts().plot(kind='bar')
    plt.title('Survival counts (1 = survived)')
plt.ylabel('number')
plt.subplot2grid((2,3),(0,1))
data['Pclass'].value_counts().plot(kind='bar')
plt.title('Pclass status')
plt.ylabel('number')
plt.subplot2grid((2,3),(0,2))
plt.scatter(data['Survived'],data['Age'])
plt.ylabel('Age')
    plt.grid(True, which='major', axis='y')
    plt.title('Survival vs. age')
plt.subplot2grid((2,3),(1,0),colspan=2)
data.Age[data.Pclass==1].plot(kind='kde')
data.Age[data.Pclass==2].plot(kind='kde')
data.Age[data.Pclass==3].plot(kind='kde')
plt.xlabel('Age')
plt.ylabel('density')
    plt.title('Age density by Pclass')
    plt.legend(('1st', '2nd', '3rd'), loc='best')
plt.subplot2grid((2,3),(1,2))
data.Embarked.value_counts().plot(kind='bar')
plt.title('Embarked status')
plt.ylabel('number')
plt.show()
fig=plt.figure()
fig.set(alpha=0.2)
survived_0=data.Pclass[data.Survived==0].value_counts()
survived_1=data.Pclass[data.Survived==1].value_counts()
    df = pd.DataFrame({'alive': survived_1, 'dead': survived_0})
    df.plot(kind='bar', stacked=True)
    plt.title('Survival by Pclass')
plt.xlabel('Pclass')
plt.ylabel('number')
plt.show()
fig=plt.figure()
fig.set(alpha=0.2)
survived_m=data.Survived[data.Sex=='male'].value_counts()
survived_f=data.Survived[data.Sex=='female'].value_counts()
    df = pd.DataFrame({'male': survived_m, 'female': survived_f})
    df.plot(kind='bar', stacked=True)
    plt.title('Survival by sex')
plt.xlabel('Sex')
plt.ylabel('number')
plt.show()
    # survival counts broken down by siblings/spouses and parents/children aboard
    g = data.groupby(['SibSp', 'Survived'])
    print(pd.DataFrame(g.count()['PassengerId']))
    g = data.groupby(['Parch', 'Survived'])
    print(pd.DataFrame(g.count()['PassengerId']))
print(data.Cabin.value_counts())
fig=plt.figure()
fig.set(alpha=0.2)
    survived_cabin = data.Survived[pd.notnull(data.Cabin)].value_counts()
    survived_nocabin = data.Survived[pd.isnull(data.Cabin)].value_counts()
    df = pd.DataFrame({'have': survived_cabin, 'not have': survived_nocabin}).transpose()
    df.plot(kind='bar', stacked=True)
    plt.title('Survival by Cabin presence')
    plt.xlabel('has Cabin record or not')
plt.ylabel('number')
plt.show()
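# Age imputation: fit a random forest on the rows whose Age is known and use it
# to predict Age for the rows where it is missing (missing Fare is zero-filled
# first so it can serve as a feature).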
def set_missing_ages(df):
df.loc[df.Fare.isnull(),'Fare']=0
age_df=df[['Age','Fare','Parch','SibSp','Pclass']]
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    y = known_age[:, 0]   # Age is the regression target
    X = known_age[:, 1:]  # Fare, Parch, SibSp, Pclass are the features
    rfr = RandomForestRegressor(random_state=0, n_estimators=200, n_jobs=-1)
    rfr.fit(X, y)
    predicted_ages = rfr.predict(unknown_age[:, 1:])
    df.loc[df.Age.isnull(), 'Age'] = predicted_ages
return df,rfr
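# Cabin is mostly missing, so reduce it to a binary Yes/No flag (recorded or not).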
def set_Cabin_type(df):
df.loc[df.Cabin.notnull(),'Cabin']='Yes'
df.loc[df.Cabin.isnull(),'Cabin']='No'
return df
def generate_new_data(df):
    dummies_cabin = pd.get_dummies(df.Cabin, prefix='Cabin')
    dummies_embarked = pd.get_dummies(df.Embarked, prefix='Embarked')
    dummies_sex = pd.get_dummies(df.Sex, prefix='Sex')
    dummies_pclass = pd.get_dummies(df.Pclass, prefix='Pclass')
    data = pd.concat([df, dummies_cabin, dummies_embarked, dummies_sex, dummies_pclass], axis=1)
data.drop(['Pclass','Name','Sex','Ticket','Cabin','Embarked'],axis=1,inplace=True)
return data
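# Standardize the wide-ranged numeric features so they are on a comparable
# scale to the 0/1 dummy columns.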
def scale_data(df):
    import sklearn.preprocessing as preprocessing
    # write the standardized values into new *_scaled columns so that the
    # 'Age_.*' / 'Fare_.*' feature filters used below actually match them
    df['Age_scaled'] = preprocessing.scale(df.Age)
    df['Fare_scaled'] = preprocessing.scale(df.Fare)
    return df
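# Diagnostic helper: train a quick logistic regression on a 70/30 split of the
# training data and print the rows it gets wrong.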
def check_bad_cases():
    ori_data = pd.read_csv('./train.csv', sep=',', header=0)
    # the raw CSV has none of the dummy/scaled columns, so run the same
    # preprocessing pipeline before filtering features
    data, _ = set_missing_ages(ori_data.copy())
    data = set_Cabin_type(data)
    data = generate_new_data(data)
    data = scale_data(data)
    split_train, split_cv = train_test_split(data, test_size=0.3, random_state=0)
    feature_regex = 'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
    st_df = split_train.filter(regex=feature_regex)
    # liblinear is the solver that supports the l1 penalty
    model = sklearn.linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-1, solver='liblinear')
    model.fit(st_df.values[:, 1:], st_df.values[:, 0])
    cv_df = split_cv.filter(regex=feature_regex)
    predictions = model.predict(cv_df.values[:, 1:])
    bad_cases = ori_data.loc[ori_data.PassengerId.isin(split_cv[predictions != cv_df.values[:, 0]].PassengerId.values)]
    print(bad_cases)
def cross_validate(model, X, y):
    scores = cross_val_score(model, X, y, cv=5)
    print(scores)
    print(np.mean(scores))
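# example (hypothetical) usage of the helper above:
#   cross_validate(sklearn.linear_model.LogisticRegression(solver='liblinear'), X, y)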
def check_model_parameter(train_df, model):
    # pair each feature column (skipping Survived) with its learned coefficient
    infer1 = pd.DataFrame({'columns': list(train_df.columns)[1:], 'coef': list(model.coef_.T)})
    print(infer1)
def draw_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    # learning_curve moved from sklearn.learning_curve to sklearn.model_selection
    from sklearn.model_selection import learning_curve
    train_sizes, train_score, test_score = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    train_score_mean = np.mean(train_score, axis=1)
    train_score_std = np.std(train_score, axis=1)
    test_score_mean = np.mean(test_score, axis=1)
    test_score_std = np.std(test_score, axis=1)
if plot:
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
        plt.xlabel('Number of training examples')
plt.ylabel('Score')
plt.grid()
        plt.fill_between(train_sizes, train_score_mean - train_score_std,
                         train_score_mean + train_score_std, alpha=0.1, color='b')
        plt.fill_between(train_sizes, test_score_mean - test_score_std,
                         test_score_mean + test_score_std, alpha=0.1, color='r')
        plt.plot(train_sizes, train_score_mean, 'o-', color='b', label='Score on training set')
        plt.plot(train_sizes, test_score_mean, 'o-', color='r', label='Score on CV set')
plt.legend(loc='best')
plt.show()
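    # summarize the curve at the largest training size: midpoint is the center
    # of the band between (train + std) and (cv - std); diff is that band's
    # width, a rough overfitting indicator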
    midpoint = ((train_score_mean[-1] + train_score_std[-1]) + (test_score_mean[-1] - test_score_std[-1])) / 2
    diff = (train_score_mean[-1] + train_score_std[-1]) - (test_score_mean[-1] - test_score_std[-1])
return midpoint,diff
if __name__ == '__main__':
    print('read data from csv file')
    data = pd.read_csv('./train.csv', sep=',', header=0)
    data, rfr = set_missing_ages(data)
    data = set_Cabin_type(data)
    show_data_in_figure(data)
    data = generate_new_data(data)
    data = scale_data(data)
    train_df = data.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    train_np = train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]
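    # simple hold-out split: the first RATIO (=0.8) of the rows train the
    # model, the remainder estimate its accuracy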
M = len(y)
pos = int(RATIO*M)
train_x = X[:pos]
train_y = y[:pos]
print(len(train_y))
test_x = X[pos:]
test_y = y[pos:]
print(len(test_y))
model = None
for i, name in enumerate(test_classifiers):
model = do_training(name, train_x, train_y, test_x, test_y)
draw_learning_curve(model,'Learning Curve',X,y)
    # apply the identical preprocessing pipeline to the test set
    test_data = pd.read_csv('./test.csv', sep=',', header=0)
    # note: the age regressor is refit on the test rows here rather than
    # reusing the training-set rfr
    test_data, rfr = set_missing_ages(test_data)
    test_data = set_Cabin_type(test_data)
    test_data = generate_new_data(test_data)
    test_data = scale_data(test_data)
    test_df = test_data.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_df.values
    m = len(test_np)
    predict_y = np.ones(m)  # dummy labels; only the features matter for prediction
    predict_x = test_np[:]
    for i, name in enumerate(test_classifiers):
        result = do_predicting(name, predict_x, predict_y)
    ans = test_data['PassengerId']
    ansD = ans.to_frame()
    other = pd.DataFrame({'Survived': result.astype(np.int32)})
    ansD = ansD.join(other)
    ansD.to_csv('./result.csv', columns=['PassengerId', 'Survived'], index=False)
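# ---------------------------------------------------------------------------
# test.py: classifier factories and the train/predict helpers imported above
# ---------------------------------------------------------------------------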
import sys
import os
import time
import numpy as np
import joblib as jl  # replaces the removed sklearn.externals.joblib
from sklearn import metrics
# the LIBSVM branches below need LIBSVM's Python bindings on the path
from svm import svm_problem, svm_parameter
from svmutil import svm_train, svm_predict, svm_save_model, svm_read_problem, svm_load_model
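# Each *_classifier factory below fits one model family on (train_x, train_y)
# and returns the fitted estimator; do_training dispatches to them by name.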
def naive_bayes_classifier(train_x, train_y):
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=0.01)
model.fit(train_x, train_y)
return model
def naive_bayes_classifier2(train_x, train_y):
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(train_x, train_y)
return model
def naive_bayes_classifier3(train_x, train_y):
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model.fit(train_x, train_y)
return model
def knn_classifier(train_x, train_y):
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(train_x, train_y)
return model
def logistic_regression_classifier(train_x, train_y):
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty='l2')
model.fit(train_x, train_y)
return model
def random_forest_classifier(train_x, train_y):
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=300)
model.fit(train_x, train_y)
return model
def decision_tree_classifier(train_x, train_y):
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(train_x, train_y)
return model
def gradient_boosting_classifier(train_x, train_y):
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=200)
model.fit(train_x, train_y)
return model
def bagging_classifier(train_x, train_y):
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(base_estimator=None)
model.fit(train_x, train_y)
return model
def voting_classifier(train_x, train_y):
    from sklearn.ensemble import VotingClassifier
    from sklearn import tree
    from sklearn.linear_model import LogisticRegression
    # combine a decision tree and a logistic regression by majority vote
    estimators = [('dt', tree.DecisionTreeClassifier()),
                  ('lr', LogisticRegression(penalty='l2'))]
    model = VotingClassifier(estimators=estimators)
    model.fit(train_x, train_y)
    return model
def ada_boosting_classifier(train_x, train_y):
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(base_estimator=None,n_estimators=300)
model.fit(train_x, train_y)
return model
def mlp_classifier(train_x, train_y):
    from sklearn.neural_network import MLPClassifier
    from sklearn.ensemble import BaggingClassifier
    # bag 20 MLPs, each trained on an 80% bootstrap sample, to reduce variance
    model = MLPClassifier(hidden_layer_sizes=(200,))
    bagging_model = BaggingClassifier(model, n_estimators=20, max_samples=0.8,
                                      max_features=1.0, bootstrap=True,
                                      bootstrap_features=False, n_jobs=-1)
    bagging_model.fit(train_x, train_y)
    return bagging_model
def svm_classifier(train_x, train_y):
from sklearn.svm import SVC
model = SVC(kernel='poly', probability=True)
model.fit(train_x, train_y)
return model
def svm_cross_validation(train_x, train_y):
    from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed
    from sklearn.svm import SVC
    model = SVC(kernel='poly', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs=1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    for para, val in best_parameters.items():
        print(para, val)
    # refit on the full training set with the best C/gamma found above
    model = SVC(kernel='poly', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model
def xgboost_classifier1(train_x, train_y):
from xgboost.sklearn import XGBClassifier
model = XGBClassifier()
model.fit(np.array(train_x), np.array(train_y))
return model
def xgboost_classifier(train_x, train_y):
from xgboost.sklearn import XGBClassifier
    model = XGBClassifier(verbosity=0,       # 'silent' was renamed in newer xgboost
                          learning_rate=0.1,
                          n_estimators=60,
                          max_depth=6,
                          min_child_weight=0.4,
                          gamma=0.5,
                          subsample=0.4,
                          colsample_bytree=1,
                          objective='binary:logistic',
                          n_jobs=4,          # formerly 'nthread'
                          scale_pos_weight=1,
                          random_state=1000) # formerly 'seed'
model.fit(train_x, train_y)
return model
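# Train the classifier named classifier_name, print its hold-out accuracy, and
# persist it under ./models/<name>.model (joblib for sklearn models, LIBSVM's
# own format otherwise).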
def do_training(classifier_name, train_x, train_y, test_x, test_y):
    model_save_file = './models/{}.model'.format(classifier_name)
    if classifier_name == 'LIBSVM':
        # '-s 1': nu-SVC, '-t 1': polynomial kernel, '-d 3': degree 3, '-q': quiet
        prob = svm_problem(np.array(train_y).tolist(), np.array(train_x).tolist())
        param = svm_parameter('-s 1 -t 1 -q -d 3')
        model = svm_train(prob, param)
        svm_save_model(model_save_file, model)
svm_predict(np.array(test_y).tolist(), np.array(test_x).tolist(), model)
return model
model_save = {}
classifiers = {'NB': naive_bayes_classifier,
'KNN': knn_classifier,
'LR': logistic_regression_classifier,
'RF': random_forest_classifier,
'DT': decision_tree_classifier,
'SVM': svm_classifier,
'SVMCV': svm_cross_validation,
'GBDT': gradient_boosting_classifier,
'ADA':ada_boosting_classifier,
'MLP': mlp_classifier,
'XGBOOST': xgboost_classifier
}
model = classifiers[classifier_name](train_x, train_y)
model_save[classifier_name]=model
predict = model.predict(test_x)
accuracy = metrics.accuracy_score(test_y, predict)
print('accuracy: %.2f%%' % (100 * accuracy))
jl.dump(model_save, model_save_file)
return model
def drawline(x,y1,y2,y3,y4,title):
import matplotlib.pyplot as plt
plt.subplot(2,2,1)
plt.title(title)
plt.xlabel('index')
plt.ylabel('value')
plt.plot(x,y1)
plt.subplot(2,2,2)
plt.title(title)
plt.xlabel('index')
plt.ylabel('value')
plt.plot(x,y2)
plt.subplot(2,2,3)
plt.xlabel('index')
plt.ylabel('value')
plt.plot(x,y3)
plt.subplot(2,2,4)
plt.xlabel('index')
plt.ylabel('value')
plt.plot(x,y4)
plt.show()
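# Load the model saved by do_training and predict test_x; test_y is only used
# by LIBSVM's accuracy printout, so dummy labels are fine.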
def do_predicting(classifier_name, test_x, test_y):
    model_save_file = './models/{}.model'.format(classifier_name)
    if classifier_name == 'LIBSVM':
        model = svm_load_model(model_save_file)
        p_labels, p_acc, p_vals = svm_predict(np.array(test_y).tolist(), np.array(test_x).tolist(), model)
        return p_labels
    # non-LIBSVM models were dumped with joblib by do_training; reload by name
    model = jl.load(model_save_file)[classifier_name]
predict = model.predict(test_x)
return predict