Kaggle baseline: Mercari Price Suggestion Challenge

import math
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from pandas import  *

import sklearn
from sklearn.preprocessing import *
from sklearn.feature_selection import *
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.pipeline import *
from sklearn.tree import *
from sklearn.svm import *
from sklearn.feature_extraction import *
from sklearn.neighbors import *
from xgboost.sklearn import *
from lightgbm.sklearn import *

from pickle import *
from sklearn.externals.joblib import *

from keras.preprocessing.text import *
from keras.preprocessing.sequence import pad_sequences


# Global switch read by the plotting helpers in this file: the matplotlib
# figures are only produced when this is True.
MY_PLOT_SHOW=True

def __set_missing_values(train, test):
    """Replace missing text/categorical values with a placeholder, in place.

    NaN entries of the ``brand_name``, ``category_name`` and
    ``item_description`` columns are replaced by the literal ``'xiaofei'``
    in both frames.

    Args:
        train: training DataFrame containing the three columns above.
        test: test DataFrame containing the same three columns.

    Returns:
        None. Both frames are modified in place.
    """
    placeholder = 'xiaofei'
    for frame in (train, test):
        for column in ('brand_name', 'category_name', 'item_description'):
            # Assign the filled column back instead of calling
            # ``frame.column.fillna(..., inplace=True)``: the in-place form
            # mutates an intermediate Series and is not guaranteed to reach
            # the parent frame (it is a no-op under pandas Copy-on-Write).
            frame[column] = frame[column].fillna(placeholder)
    
def __set_label_encoder(train,test):
    """Integer-encode ``category_name`` and ``brand_name`` in both frames.

    For each of the two columns a fresh LabelEncoder is fitted on the train
    and test values stacked together, and the column is then overwritten
    with the encoded values in both frames (in place).
    """
    for column in ('category_name', 'brand_name'):
        # LabelEncoder sorts the distinct strings alphabetically; transform()
        # yields each value's index in that fitted ordering.
        encoder = LabelEncoder()
        encoder.fit(np.hstack([train[column], test[column]]))
        train[column] = encoder.transform(train[column])
        test[column] = encoder.transform(test[column])


def __2Dto1D_BAD(train,test):
    """Collapse each token sequence to its mean and standard deviation.

    For ``item_description`` and ``name`` in both frames, a new
    ``<column>_std`` column receives ``np.std`` of every row's sequence and
    the column itself is overwritten with ``np.mean`` of the sequence. The
    two new column names are also appended to the module-level
    ``train_column_names`` and ``test_column_names`` lists.
    """
    for frame in (train, test):
        for column in ('item_description', 'name'):
            # The std column has to be derived first: the statement after it
            # overwrites the sequences it is computed from.
            frame.loc[:, column + '_std'] = frame[column].apply(lambda seq: np.std(seq))
            frame.loc[:, column] = frame[column].apply(lambda seq: np.mean(seq))

    # Register the derived columns in the shared column-name lists.
    for column_names in (train_column_names, test_column_names):
        column_names.extend(['name_std', 'item_description_std'])

def __set_tokenizer(train, test):
    """Replace the free-text columns with sequences of word indices.

    A single Tokenizer is fitted on the lower-cased ``item_description`` and
    ``name`` texts of the training frame only. The ``item_description`` and
    ``name`` columns of both frames are then overwritten, in place, with the
    output of ``texts_to_sequences`` on their lower-cased text.
    """
    tokenizer = Tokenizer()
    # Fitting builds word_counts / word_index / word_docs; texts_to_sequences
    # later maps every word to its position in word_index.
    training_corpus = np.hstack([train.item_description.str.lower(),
                                 train.name.str.lower()])
    tokenizer.fit_on_texts(training_corpus)

    for frame in (train, test):
        for column in ('item_description', 'name'):
            frame[column] = tokenizer.texts_to_sequences(frame[column].str.lower())


def my_print_datas(train):
    """Print a quick textual overview of a DataFrame.

    Sets the pandas display width to 100 and the display precision to 2
    (process-wide options), then prints the shape, the first five rows, the
    column dtypes, ``describe()``, ``info()`` and the row count per
    ``item_condition_id``.

    Args:
        train: DataFrame to inspect; must contain ``item_condition_id``.

    Returns:
        None.
    """
    set_option('display.width', 100)
    # Fully-qualified key: the bare 'precision' pattern is ambiguous on pandas
    # versions that also define other '*precision' options.
    set_option('display.precision', 2)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(train.shape)
    print(train.head(5))
    print(train.dtypes)  # shows which object columns still need encoding
    print(train.describe())
    # info() writes its report itself and returns None, so wrapping it in
    # print only added a stray "None" line.
    train.info()  # shows which columns have missing data
    print(train.groupby('item_condition_id').size())  # class balance check


def my_draw_datas(train):
    """Print and plot basic distribution/correlation diagnostics.

    Does nothing unless the module-level ``MY_PLOT_SHOW`` flag is true.
    Otherwise prints the Pearson correlation matrix and the skew of the
    numeric columns, then shows histograms, density plots, box plots, a
    correlation heat-map and a scatter matrix.

    Args:
        train: DataFrame to visualise.

    Returns:
        None.
    """
    if not MY_PLOT_SHOW:
        return

    # Work on the numeric columns explicitly rather than relying on
    # corr()/skew() silently dropping non-numeric ones.
    numeric = train.select_dtypes(include=[np.number])

    # Strongly correlated features (values near -1 or +1) are redundant.
    correlations = numeric.corr(method='pearson')
    print(correlations)
    # A skew of 0 is symmetric; large values suggest standardising.
    print(numeric.skew())

    numeric.hist()  # eyeball whether the features look Gaussian
    plt.show()

    numeric.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
    plt.show()

    numeric.plot(kind='box', subplots=True, layout=(3, 3), sharex=False)
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(correlations, vmin=-1, vmax=1)
    fig.colorbar(cax)
    # Ticks and labels are taken from the matrix being drawn, so their count
    # always matches and every label names the right row/column.
    ticks = np.arange(len(correlations.columns))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(correlations.columns)
    ax.set_yticklabels(correlations.columns)
    plt.show()

    # pandas.plotting is the supported home of scatter_matrix.
    pd.plotting.scatter_matrix(numeric)
    plt.show()

def _pad_and_project(train_sequences, test_sequences, length):
    """Pad/truncate sequences to ``length`` and project them with a PCA fitted on train."""
    train_padded = pad_sequences(train_sequences, maxlen=length)
    test_padded = pad_sequences(test_sequences, maxlen=length)
    pca = PCA(n_components=length, copy=False)
    fit = pca.fit(train_padded)
    # A larger ratio means the component explains more of the variance.
    print(fit.explained_variance_ratio_)
    print(fit.components_)
    return fit.transform(train_padded), fit.transform(test_padded)


def __2Dto1D(train,test):
    """Turn the tokenised frames into flat numeric feature matrices.

    ``name`` and ``item_description`` (lists of word indices) are padded or
    truncated to 10 and 75 entries respectively and passed through a PCA
    fitted on the training rows. The result is stacked column-wise as
    ``[name | item_condition_id, category_name, brand_name, shipping |
    item_description]``.

    Args:
        train: training DataFrame; must also contain ``price``.
        test: test DataFrame with the same feature columns.

    Returns:
        Tuple ``(x_train, y_train, x_test)`` of numpy arrays, where
        ``y_train`` holds the training prices.
    """
    # Longest observed sequences, printed only as guidance for the fixed
    # lengths chosen below.
    max_name_seq = max(train.name.apply(len).max(), test.name.apply(len).max())
    max_seq_item_description = max(train.item_description.apply(len).max(),
                                   test.item_description.apply(len).max())
    print('%s %s' % (max_name_seq, max_seq_item_description))

    if MY_PLOT_SHOW:
        train.item_description.apply(len).hist()
        train.name.apply(len).hist()

    estimated_name_len = 10
    estimated_item_des_len = 75

    train_data_01, test_data_01 = _pad_and_project(
        train.name, test.name, estimated_name_len)
    train_data_02, test_data_02 = _pad_and_project(
        train.item_description, test.item_description, estimated_item_des_len)

    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    tabular_columns = ['item_condition_id', 'category_name', 'brand_name', 'shipping']
    x_train = np.hstack([train_data_01, train[tabular_columns].values, train_data_02])
    y_train = train.price.values
    x_test = np.hstack([test_data_01, test[tabular_columns].values, test_data_02])
    return x_train, y_train, x_test




def my_feature_extraction(train, rest):
    """Run the full feature-extraction chain on the train and test frames.

    Fills missing values, label-encodes the categorical columns, tokenises
    the text columns and flattens everything into numeric matrices. Both
    frames are modified in place by the intermediate steps.

    Args:
        train: training DataFrame.
        rest: test DataFrame. The name is kept for backward compatibility;
            it is the frame processed alongside ``train``.

    Returns:
        Tuple ``(x_train, y_train, x_test)`` as produced by ``__2Dto1D``.
    """
    # Previously the argument was ignored and the body silently used a
    # module-level ``test`` variable; bind the argument explicitly.
    test = rest
    __set_missing_values(train, test)
    __set_label_encoder(train, test)
    __set_tokenizer(train, test)
    x_train, y_train, x_test = __2Dto1D(train, test)
    # __2Dto1D_BAD(train, test) is the discarded mean/std alternative.

    return x_train, y_train, x_test

def my_preprocessing_data(x_train_f, x_test_f):
    """Rescale every feature to the [0, 1] range.

    The MinMaxScaler is fitted on the training matrix only and then applied
    to both matrices.

    Author's notes on the available scalers (a and b bring features to the
    same unit):
      a. MinMaxScaler(feature_range=(0,1)) -- "for g d t"
      b. StandardScaler(), scale() -- for gaussian input, LR, LDA
      c. Normalizer(copy=True, norm='l2'), normalize() -- for sparse
         features, NN, KNN; mostly used in text classification or clustering
      d. Binarizer(copy=True, threshold=0.0)

    Returns:
        Tuple ``(x_train_f, x_test_f)`` of the scaled matrices.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_train = scaler.fit_transform(x_train_f)
    scaled_test = scaler.transform(x_test_f)
    # A per-column StandardScaler variant was sketched here originally but
    # was never enabled.
    return scaled_train, scaled_test
    
def my_feature_selection(x_train_f, y_train_f, x_test_f, k=80, score_func=None):
    """Keep the ``k`` best-scoring features according to ``score_func``.

    The selector is fitted on the training data and then applied to both
    matrices; the per-feature scores are printed (higher is better).

    Args:
        x_train_f: training feature matrix (2-D, array-like).
        y_train_f: training target.
        x_test_f: test feature matrix with the same columns.
        k: number of features to keep, or ``'all'``. Capped at the number of
            available columns so a narrower matrix does not raise.
        score_func: univariate scoring function; ``None`` keeps the original
            choice, ``chi2``.

    Returns:
        Tuple ``(x_train, y_train_f, x_test)`` with the reduced matrices and
        the unchanged target.
    """
    if score_func is None:
        # NOTE(review): chi2 is a classification score and the target here is
        # a continuous price; f_regression is probably the better fit --
        # confirm before changing the default.
        score_func = chi2
    if k != 'all':
        k = min(k, np.shape(x_train_f)[1])

    selector = SelectKBest(score_func=score_func, k=k)
    fit = selector.fit(x_train_f, y_train_f)
    print(fit.scores_)  # the bigger the score, the better the feature
    x_train = fit.transform(x_train_f)
    x_test = fit.transform(x_test_f)

    # Alternatives sketched in the original notes but never enabled:
    # VarianceThreshold, RFE around LinearRegression, PCA, and
    # ExtraTreesRegressor feature importances.
    return x_train, y_train_f, x_test


# def __rmsle(y,predicted):
#     return np.sqrt(np.mean(np.square(np.log(1+predicted)-np.log(1+y))))
    
def __rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between two sequences.

    Absolute values are taken before the logarithm, so negative predictions
    do not raise.

    Args:
        y: true values (any sized iterable, including a pandas Series).
        y_pred: predicted values, same length as ``y``.

    Returns:
        The RMSLE as a float.

    Raises:
        AssertionError: if the two inputs differ in length.
        ZeroDivisionError: if the inputs are empty.
    """
    assert len(y) == len(y_pred)
    # Pair the values positionally with zip(): indexing ``y[i]`` is
    # label-based on a pandas Series and picks the wrong rows (or raises)
    # when the index is not 0..n-1, e.g. after a train/test split.
    squared_log_errors = [
        (math.log(math.fabs(pred) + 1) - math.log(math.fabs(actual) + 1)) ** 2.0
        for actual, pred in zip(y, y_pred)
    ]
    return (sum(squared_log_errors) * (1.0 / len(y))) ** 0.5

def my_normal_model_selection(X_train,y_train):
    """Compare baseline regressors by cross-validated RMSLE, then grid-search an SVR.

    Candidate families noted by the author:
      classification -- linear: LogisticRegression, LinearDiscriminantAnalysis;
        non-linear: KNeighborsClassifier, GaussianNB, DecisionTreeClassifier, SVC
      regression -- linear: LinearRegression, Ridge, Lasso, ElasticNet;
        non-linear: KNeighborsRegressor, DecisionTreeRegressor, SVR

    Each regression candidate is scored with 5-fold cross-validation and its
    mean/std RMSLE printed; when ``MY_PLOT_SHOW`` is true a box plot compares
    them. A grid search over a fixed SVR grid is then run and its best score
    and parameters printed.

    Args:
        X_train: training feature matrix.
        y_train: training target.

    Returns:
        The SVR instance handed to the grid search.
    """
    regression_pipelines = {
        'LR': Pipeline([('LR', LinearRegression())]),
        'RIGE': Pipeline([('RIGE', Ridge())]),
        'LA': Pipeline([('LA', Lasso())]),
        'EN': Pipeline([('EN', ElasticNet())]),
        'KN': Pipeline([('KN', KNeighborsRegressor())]),
        'DT': Pipeline([('DT', DecisionTreeRegressor())]),
        'SVM': Pipeline([('SVM', SVR())]),
    }

    # random_state is deliberately omitted: it has no effect without
    # shuffle=True, and current scikit-learn rejects that combination.
    kf = KFold(n_splits=5)
    # greater_is_better=True leaves the sign untouched, so the printed
    # numbers are the raw RMSLE (lower is better).
    scorer = make_scorer(score_func=__rmsle, greater_is_better=True)

    names = []
    results = []
    for key, pipeline in regression_pipelines.items():
        cv_result = cross_val_score(pipeline, X_train, y_train, cv=kf,
                                    scoring=scorer, n_jobs=-1)
        names.append(key)
        results.append(cv_result)
        print('%s %s %s' % (key, cv_result.mean(), cv_result.std()))

    if MY_PLOT_SHOW:
        fig = plt.figure()
        fig.suptitle("Algorithm Comparison")
        ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.show()

    # Grid-search the best non-ensemble model. A KNeighborsRegressor grid
    # ({'n_neighbors':[10],'algorithm':['auto'],'leaf_size':[30],'p':[2]})
    # was the author's commented-out alternative.
    parameters = {'kernel':['linear'],'degree':[1],'epsilon':[0.1],'gamma':['auto'],'C':[1]}
    model = SVR()
    best_score, best_params = __my_grid_search(X_train, y_train, model, parameters)
    print('%s %s' % (best_score, best_params))

    # Other scorers the original notes tried with cross_val_score:
    # 'neg_mean_absolute_error', 'neg_mean_squared_error' and 'r2' for
    # regression; 'neg_log_loss' and 'roc_auc' for classifiers only.
    #
    # NOTE(review): the returned SVR is unfitted and still has its default
    # hyper-parameters -- best_params is only printed. Confirm whether the
    # tuned parameters should be applied before returning.
    return model
    
    
def my_ensemble_model_selection(X_train,y_train):
    """Compare ensemble regressors by cross-validated RMSLE, then grid-search XGBoost.

    Bagging-style (Bagging, RandomForest, ExtraTrees) and boosting-style
    (AdaBoost, GradientBoosting, XGBoost) regressors are scored with 5-fold
    cross-validation and their mean/std RMSLE printed; when ``MY_PLOT_SHOW``
    is true a box plot compares them. A grid search over a fixed
    XGBRegressor grid is then run and its best score/parameters printed.

    Args:
        X_train: training feature matrix.
        y_train: training target.

    Returns:
        The XGBRegressor instance handed to the grid search.
    """
    ensembles = {
        'BAG': Pipeline([('BAG', BaggingRegressor())]),
        'RF': Pipeline([('RF', RandomForestRegressor())]),
        'ET': Pipeline([('ET', ExtraTreesRegressor())]),
        'ADA': Pipeline([('ADA', AdaBoostRegressor())]),
        'GB': Pipeline([('GB', GradientBoostingRegressor())]),
        'XGB': Pipeline([('XGB', XGBRegressor())]),
        # 'GBM': Pipeline([('GBM', LGBMRegressor())]),
    }

    # random_state is deliberately omitted: it has no effect without
    # shuffle=True, and current scikit-learn rejects that combination.
    kf = KFold(n_splits=5)
    # greater_is_better=True leaves the sign untouched, so the printed
    # numbers are the raw RMSLE (lower is better).
    scorer = make_scorer(score_func=__rmsle, greater_is_better=True)

    names = []
    results = []
    for key, pipeline in ensembles.items():
        cv_result = cross_val_score(pipeline, X_train, y_train, cv=kf,
                                    n_jobs=-1, scoring=scorer)
        names.append(key)
        results.append(cv_result)
        print('%s %s %s' % (key, cv_result.mean(), cv_result.std()))

    if MY_PLOT_SHOW:
        fig = plt.figure()
        fig.suptitle("Algorithm Comparison")
        ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.show()

    # Grid-search the best ensemble model.
    model = XGBRegressor()
    parameters = {'n_estimators':[10],'learning_rate':[0.1],'max_depth':[1],'booster':['gbtree'],'min_child_weight':[1],'subsample':[1.0],'random_state':[10]}
    best_score, best_params = __my_grid_search(X_train, y_train, model, parameters)
    print('%s %s' % (best_score, best_params))

    # The original notes also sketched stand-alone cross-validation runs for
    # each bagging/boosting estimator (n_estimators=100, random_state=7) and
    # a voting ensemble of LinearRegression + SVR; none were enabled.
    #
    # NOTE(review): the returned XGBRegressor is unfitted and still has its
    # default hyper-parameters -- best_params is only printed.
    return model

def __my_cross_validate(train):
    """Demonstrate three validation schemes with a LinearRegression model.

    1. A single 67/33 hold-out split, printing the model's ``score``.
    2. A shuffled 5-fold split, printing the training rows of every fold and
       finally the hold-out training features from step 1.
    3. Leave-one-out cross-validation, printing the mean and std of the
       scores.

    Features and target are taken from ``train`` using the module-level
    ``test_column_names`` and ``label_column_names`` lists.

    Args:
        train: DataFrame containing the feature and label columns.

    Returns:
        None.
    """
    test_size = 0.33
    seed = 4
    features = train[test_column_names]
    target = train[label_column_names]

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=seed)
    model = LinearRegression()
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))

    kf = KFold(n_splits=5, random_state=seed, shuffle=True)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has;
    # convert once instead of once per fold.
    rows = train.values
    for train_index, _ in kf.split(features, target):
        print(rows[train_index])
    print(x_train)

    # NOTE(review): the default regressor score may be undefined on the
    # single-sample test folds leave-one-out produces -- confirm the metric.
    loocv = LeaveOneOut()
    result = cross_val_score(LinearRegression(), features, target, cv=loocv)
    print('%s %s' % (result.mean(), result.std()))
    
def __my_grid_search(x,y,model,parameters):
    """Exhaustively search ``parameters`` for ``model`` with 5-fold CV on RMSLE.

    Intended for small grids (roughly three parameters or fewer); the
    original notes suggest RandomizedSearchCV for larger ones.

    Args:
        x: feature matrix.
        y: target.
        model: estimator to tune.
        parameters: dict mapping parameter names to lists of candidates.

    Returns:
        Tuple ``(best_score, best_params)``. Because the scorer is built with
        ``greater_is_better=False``, ``best_score`` is the negated RMSLE.
    """
    # random_state is deliberately omitted: it has no effect without
    # shuffle=True, and current scikit-learn rejects that combination.
    kf = KFold(n_splits=5)
    scorer = make_scorer(score_func=__rmsle, greater_is_better=False)
    # n_jobs=-1 uses every core, which matters for run time.
    grid = GridSearchCV(estimator=model, param_grid=parameters, cv=kf,
                        n_jobs=-1, scoring=scorer)
    grid.fit(x, y)
    print(grid.get_params())

    return grid.best_score_, grid.best_params_
    
def my_save_model(train):
    """Cross-validate a voting ensemble, write it to disk, reload it and re-validate.

    Features and target are taken from ``train`` using the module-level
    ``test_column_names`` and ``label_column_names`` lists. The model is
    written to ``./test.model``.

    Args:
        train: DataFrame containing the feature and label columns.

    Returns:
        None.
    """
    features = train[test_column_names]
    target = train[label_column_names]

    # VotingRegressor instead of VotingClassifier: both members are
    # regressors and the target is a continuous price. Requires a
    # scikit-learn release that ships VotingRegressor.
    models = [('lr', LinearRegression()), ('svm', SVR())]
    model = VotingRegressor(estimators=models)

    # random_state is deliberately omitted: it has no effect without
    # shuffle=True, and current scikit-learn rejects that combination.
    kf = KFold(n_splits=5)
    result = cross_val_score(model, features, target, cv=kf)
    print('%s %s' % (result.mean(), result.std()))  # estimator's default score

    # NOTE(review): cross_val_score fits clones, so ``model`` itself is still
    # unfitted when it is dumped -- confirm whether it should be fitted first.
    # dump/load are presumably joblib's path-based versions (the later star
    # import); verify against the import block.
    dump(model, './test.model')
    my_model = load('./test.model')
    result = cross_val_score(my_model, features, target, cv=kf)
    print('%s %s' % (result.mean(), result.std()))  # estimator's default score
    
def my_draw_learning_curve(estimator,X,y,train_sizes=np.linspace(.05,1.,20)):
    """Plot a learning curve for ``estimator`` scored with the RMSLE scorer.

    Nothing is computed unless the module-level ``MY_PLOT_SHOW`` flag is
    true. The scorer is built with ``greater_is_better=False``.

    Args:
        estimator: model passed to ``learning_curve``.
        X: feature matrix.
        y: target.
        train_sizes: training-set sizes to evaluate.

    Returns:
        ``None`` when plotting is disabled; otherwise ``(midpoint, diff)``
        computed at the largest training size, where ``midpoint`` averages
        (train mean + train std) and (cv mean - cv std) and ``diff`` is the
        gap between those two values.
    """
    if not MY_PLOT_SHOW:
        return None

    scorer = make_scorer(__rmsle, greater_is_better=False)
    sizes, train_scores, cv_scores = learning_curve(
        estimator, X, y, train_sizes=train_sizes, scoring=scorer)

    # Aggregate across the CV folds (axis 1) for every training size.
    train_score_mean = np.mean(train_scores, axis=1)
    train_score_std = np.std(train_scores, axis=1)
    test_score_mean = np.mean(cv_scores, axis=1)
    test_score_std = np.std(cv_scores, axis=1)

    plt.figure()
    plt.title('Learning Curve')
    plt.xlabel('Number of training set')
    plt.ylabel('Score')
    plt.grid()

    curves = (
        (train_score_mean, train_score_std, 'b', 'Score in training set'),
        (test_score_mean, test_score_std, 'r', 'Score in cv set'),
    )
    # Shaded one-standard-deviation bands first, then the mean curves.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc='best')
    plt.show()

    midpoint = ((train_score_mean[-1]+train_score_std[-1]+test_score_mean[-1]-test_score_std[-1]))/2
    diff = (train_score_mean[-1]+train_score_std[-1])-(test_score_mean[-1]-test_score_std[-1])
    return midpoint, diff

# Number of rows read from each TSV file (passed as nrows= in the __main__ block).
MYROWS=1000
# Columns kept from the training file; includes the 'price' target.
train_column_names = ['name', 'item_condition_id', 'category_name', 'brand_name', 'price', 'shipping', 'item_description']
# Feature columns kept from the test file (same as above without 'price').
test_column_names = ['name', 'item_condition_id', 'category_name', 'brand_name', 'shipping','item_description']
# Target column, kept as a list so it can be used to index a DataFrame.
label_column_names = ['price']

# Script entry point: runs the whole baseline pipeline on the first MYROWS
# rows of ./train.tsv and ./test.tsv.
if __name__ =='__main__':
    
    #1. read data
    oringin_train = pd.read_table('./train.tsv', nrows=MYROWS)
    oringin_test = pd.read_table('./test.tsv', nrows=MYROWS)
    # Work on copies restricted to the columns of interest so the frames read
    # from disk stay untouched.
    train = oringin_train[train_column_names].copy()
    test = oringin_test[test_column_names].copy()
    
    #==================================Feature Engineering Start========================================================
    #2. understand data, can be called anywhere, several times
    my_print_datas(train)
    
    #3. look at the data again by plotting it, can be called anywhere, several times
    my_draw_datas(train)
    
    #4. feature_extraction,fill_missing_data,one-hot,labelencoder,tokenizer,padsequence
    #all data to numeric
    x_train_f, y_train_f, x_test_f=my_feature_extraction(train, test)

    
    #5. preprocessing,standardize,scale,normalizer,minmaxscaler
    x_train_f, x_test_f=my_preprocessing_data(x_train_f, x_test_f)
    
    #6. feature selection,K, feature_importance
    X_train,y_train,X_test = my_feature_selection(x_train_f,y_train_f,x_test_f)
    # ==================================Feature Engineering End=========================================================
    
    #7. normal model selection, pipelines, gridsearch, crossvalidate
    estimator1 = my_normal_model_selection(X_train,y_train)
    
    #8. ensemble model selection, pipelines, gridsearch, crossvalidate
    estimator2 = my_ensemble_model_selection(X_train,y_train)

    # Only the non-ensemble estimator is carried forward; estimator2 is not
    # used again in this block.
    model = estimator1
    #9. draw learning curve
    my_draw_learning_curve(model,X_train,y_train)

    #10. serialize
    # NOTE(review): at this point ``train`` has already been modified in place
    # by step 4 (encoded and tokenised columns) -- confirm that is the frame
    # my_save_model is meant to receive.
    my_save_model(train)

    # NOTE(review): everything from here to the end of the block references
    # names that are not defined anywhere in the visible file (the
    # ``seq_name`` / ``seq_item_description`` columns, ``RATIO`` and
    # ``fill_data``) -- presumably left over from another script. Confirm and
    # either supply them or remove this section.
    MAX_NAME_SEQ = 10
    MAX_ITEM_DESC_SEQ = 75
    MAX_TEXT = np.max([np.max(train.seq_name.max())
                          , np.max(test.seq_name.max())
                          , np.max(train.seq_item_description.max())
                          , np.max(test.seq_item_description.max())]) + 2
    MAX_CATEGORY = np.max([train.category_name.max(), test.category_name.max()]) + 1
    MAX_BRAND = np.max([train.brand_name.max(), test.brand_name.max()]) + 1
    MAX_CONDITION = np.max([train.item_condition_id.max(), test.item_condition_id.max()]) + 1

    #train["target"] = np.log(train.price + 1)
    #target_scaler = MinMaxScaler(feature_range=(-1, 1))
    #train["target"] = target_scaler.fit_transform(train.target.reshape(-1, 1))
    #pd.DataFrame(train.target).hist()
    
    print len(train)

    # Hold out part of the training rows for validation; RATIO (undefined
    # here, see the note above) is passed as the training fraction.
    dtrain, dvalid = train_test_split(train, random_state=123, train_size=RATIO)


    x_train=fill_data(dtrain,MAX_NAME_SEQ,MAX_ITEM_DESC_SEQ).filter(items=['name','item_desc','brand_name','category_name','item_condition_id','shipping'])
    y_train=dtrain.price
    x_valid=fill_data(dvalid, MAX_NAME_SEQ, MAX_ITEM_DESC_SEQ).filter(items=['name','item_desc','brand_name','category_name','item_condition_id','shipping'])
    y_valid = dvalid.price
    
    x_test = fill_data(test, MAX_NAME_SEQ, MAX_ITEM_DESC_SEQ).filter(items=['name','item_desc','brand_name','category_name','item_condition_id','shipping'])
    # Placeholder target of ones, one entry per test row.
    y_test = np.ones(len(x_test))
    

   #
   # result = do_predicting(name,x_test.as_matrix(), y_test)
   # ans = test['test_id']
   # ansD = ans.to_frame()
   # other = pandas.DataFrame({'price':result})
   # #print other
   # ansD = ansD.join(other)
   # #print ansD
   # ansD.to_csv('./result.csv', columns=['test_id','price'], index = False)

你可能感兴趣的:(Python,Machine,Learning)