import math
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from pandas import *
import sklearn
from sklearn.preprocessing import *
from sklearn.feature_selection import *
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.pipeline import *
from sklearn.tree import *
from sklearn.svm import *
from sklearn.feature_extraction import *
from sklearn.neighbors import *
from xgboost.sklearn import *
from lightgbm.sklearn import *
from pickle import *
from sklearn.externals.joblib import *
from keras.preprocessing.text import *
from keras.preprocessing.sequence import pad_sequences
MY_PLOT_SHOW = True  # global switch: set False to suppress all matplotlib windows
def __set_missing_values(train, test):
train.brand_name.fillna('xiaofei', inplace=True)
train.category_name.fillna('xiaofei', inplace=True)
train.item_description.fillna('xiaofei', inplace=True)
test.brand_name.fillna('xiaofei', inplace=True)
test.category_name.fillna('xiaofei', inplace=True)
test.item_description.fillna('xiaofei', inplace=True)
def __set_label_encoder(train, test):
    """Integer-encode category_name and brand_name, in place.

    For each column a LabelEncoder is fit on the union of train and test
    values, so both frames share one label space (fit sorts the strings
    alphabetically; transform maps each string to its sorted index).
    """
    for column in ('category_name', 'brand_name'):
        encoder = LabelEncoder()
        encoder.fit(np.hstack([train[column], test[column]]))
        train[column] = encoder.transform(train[column])
        test[column] = encoder.transform(test[column])
def __2Dto1D_BAD(train, test):
    """Flatten the tokenized text columns to scalar mean/std features (in place).

    Superseded by __2Dto1D (pad_sequences + PCA); the call site keeps this
    commented out. Retained for reference only.
    """
    # Per-row std of the integer word-index sequences, stored as new columns.
    train.loc[:, ('item_description_std')] = train.item_description.apply(lambda x: np.std(x))
    train.loc[:, ('name_std')] = train.name.apply(lambda x: np.std(x))
    # Per-row mean overwrites the original sequence columns.
    train.loc[:, ('item_description')] = train.item_description.apply(lambda x: np.mean(x))
    train.loc[:, ('name')] = train.name.apply(lambda x: np.mean(x))
    test.loc[:, ('item_description_std')] = test.item_description.apply(lambda x: np.std(x))
    test.loc[:, ('name_std')] = test.name.apply(lambda x: np.std(x))
    test.loc[:, ('item_description')] = test.item_description.apply(lambda x: np.mean(x))
    test.loc[:, ('name')] = test.name.apply(lambda x: np.mean(x))
    # NOTE(review): these mutate the module-level column-name lists, so calling
    # this function more than once appends duplicate entries.
    train_column_names.append('name_std')
    train_column_names.append('item_description_std')
    test_column_names.append('name_std')
    test_column_names.append('item_description_std')
def __set_tokenizer(train, test):
    """Replace the text columns with integer word-index sequences, in place.

    A Keras Tokenizer is fit on the lower-cased TRAIN text only
    (item_description + name); texts_to_sequences then maps each row to a
    list of word indices. NOTE(review): test-only words are absent from the
    fit vocabulary and are therefore dropped by texts_to_sequences.
    """
    tokenizer = Tokenizer()
    corpus = np.hstack([train.item_description.str.lower(), train.name.str.lower()])
    # fit_on_texts builds word_counts / word_index / word_docs;
    # texts_to_sequences emits indices according to word_index.
    tokenizer.fit_on_texts(corpus)
    for frame in (train, test):
        frame['item_description'] = tokenizer.texts_to_sequences(frame.item_description.str.lower())
        frame['name'] = tokenizer.texts_to_sequences(frame.name.str.lower())
    return
def my_print_datas(train):
    """Print quick-look diagnostics for a DataFrame (Python 2 prints).

    Side effect: changes pandas display options process-wide.
    """
    set_option('display.width', 100)
    set_option('precision', 2)
    print train.shape
    print train.head(5)
    print train.dtypes  # check whether object columns need converting to numeric
    print train.describe()
    print train.info()  # check whether missing data needs handling
    print train.groupby('item_condition_id').size()  # check whether standardizing is needed
def my_draw_datas(train):
    """Visual EDA: correlation/skew printouts, histograms, density and box
    plots, a correlation heatmap, and a scatter matrix.

    No-op when the module-level MY_PLOT_SHOW flag is False.
    """
    if MY_PLOT_SHOW:
        print train.corr(method='pearson')  # the closer to +/-1 between two features, the more redundant they are
        print train.skew()  # 0 is best; strong left/right skew suggests standardizing
        train.hist()  # eyeball for gaussian-looking distributions
        plt.show()
        train.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
        plt.show()
        train.plot(kind='box', subplots=True, layout=(3, 3), sharex=False)
        plt.show()
        correlations = train.corr()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(correlations, vmin=-1, vmax=1)
        fig.colorbar(cax)
        # NOTE(review): ticks cover 0..8 but train_column_names has 7 entries --
        # confirm the labels line up with train.corr()'s columns.
        ticks = np.arange(0, 9, 1)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.set_xticklabels(train_column_names)
        ax.set_yticklabels(train_column_names)
        plt.show()
        scatter_matrix(train)  # from the pandas star import
        plt.show()
def __2Dto1D(train, test):
    """Turn the variable-length token sequences into fixed-width matrices.

    name is padded/truncated to 10 columns and item_description to 75, each
    padded matrix is passed through a PCA fit on train only, and the results
    are stacked with the scalar columns into the final design matrices.

    :return: (x_train, y_train, x_test) numpy arrays.
    """
    # Longest observed sequence in either frame, printed for comparison with
    # the hard-coded cutoffs below.
    max_name_seq = np.max(
        [np.max(train.name.apply(lambda x: len(x))), np.max(test.name.apply(lambda x: len(x)))])
    max_seq_item_description = np.max([np.max(train.item_description.apply(lambda x: len(x)))
        , np.max(test.item_description.apply(lambda x: len(x)))])
    print max_name_seq, max_seq_item_description
    if MY_PLOT_SHOW:
        # Length histograms used to choose the cutoffs below.
        train.item_description.apply(lambda x: len(x)).hist()
        train.name.apply(lambda x: len(x)).hist()
    estimated_name_len = 10      # cutoff estimated from the histograms above
    estimated_item_des_len = 75
    train_data_01 = pad_sequences(train.name, maxlen=estimated_name_len)
    test_data_01 = pad_sequences(test.name, maxlen=estimated_name_len)
    # NOTE(review): n_components equals the padded width, so this PCA rotates
    # the data but does not reduce dimensionality -- confirm that is intended.
    pca = PCA(n_components=estimated_name_len, copy=False)
    fit = pca.fit(train_data_01)
    print fit.explained_variance_ratio_  # bigger = that component explains more variance
    print fit.components_
    train_data_01 = fit.transform(train_data_01)
    test_data_01 = fit.transform(test_data_01)
    train_data_02 = pad_sequences(train.item_description, maxlen=estimated_item_des_len)
    test_data_02 = pad_sequences(test.item_description, maxlen=estimated_item_des_len)
    pca = PCA(n_components=estimated_item_des_len, copy=False)
    fit = pca.fit(train_data_02)
    print fit.explained_variance_ratio_  # bigger = that component explains more variance
    print fit.components_
    train_data_02 = fit.transform(train_data_02)
    test_data_02 = fit.transform(test_data_02)
    # Final layout: [name-PCA | item_condition_id | category_name | brand_name |
    # shipping | item_description-PCA].
    # NOTE(review): DataFrame/Series.as_matrix() was removed in pandas 0.25 --
    # on modern pandas use .values / .to_numpy().
    x_train = np.hstack([train_data_01,
        train.item_condition_id.as_matrix().reshape(-1, 1),
        train.category_name.as_matrix().reshape(-1, 1),
        train.brand_name.as_matrix().reshape(-1, 1),
        train.shipping.as_matrix().reshape(-1, 1),
        train_data_02])
    y_train = train.price.as_matrix()
    x_test = np.hstack([test_data_01,
        test.item_condition_id.as_matrix().reshape(-1, 1),
        test.category_name.as_matrix().reshape(-1, 1),
        test.brand_name.as_matrix().reshape(-1, 1),
        test.shipping.as_matrix().reshape(-1, 1),
        test_data_02])
    return x_train, y_train, x_test
def my_feature_extraction(train, test):
    """Run the full feature-extraction pipeline on both frames, in order:
    fill missing values, label-encode categoricals, tokenize text, then
    flatten the sequences into fixed-width numeric matrices.

    BUG FIX: the second parameter was named `rest` while the body used
    `test`; it only worked because a module-global named `test` happened to
    exist at call time. Renamed the parameter so the argument actually
    passed in is the one processed (the call site passes positionally).

    :param train: training DataFrame (mutated in place by the helpers).
    :param test: test DataFrame (mutated in place by the helpers).
    :return: (x_train, y_train, x_test) numpy arrays from __2Dto1D.
    """
    __set_missing_values(train, test)
    __set_label_encoder(train, test)
    __set_tokenizer(train, test)
    x_train, y_train, x_test = __2Dto1D(train, test)
    # __2Dto1D_BAD(train, test)  # superseded mean/std flattening, kept for reference
    return x_train, y_train, x_test
def my_preprocessing_data(x_train_f, x_test_f):
    """Rescale all features to [0, 1] with a single MinMaxScaler.

    The scaler is fit on the training matrix only and then applied to the
    test matrix, so no information leaks from test into the fit.

    Scaler cheat-sheet (from the original notes):
      a. MinMaxScaler(feature_range=(0,1)) -- gradient descent / distance / tree methods
      b. StandardScaler()/scale()          -- gaussian inputs, LR, LDA
      c. Normalizer(norm='l2')/normalize() -- sparse features, NN/KNN, text, clustering
      d. Binarizer(threshold=0.0)
    (a and b map features onto a comparable unit.)

    :return: (scaled_train, scaled_test) numpy arrays.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_train = scaler.fit_transform(x_train_f)
    scaled_test = scaler.transform(x_test_f)
    # Per-column StandardScaler alternative kept from the original notes:
    # ss = StandardScaler()
    # train.name = ss.fit_transform(train.name.reshape(-1, 1))
    # train.category_name = ss.fit_transform(train.category_name.reshape(-1, 1))
    # train.brand_name = ss.fit_transform(train.brand_name.reshape(-1, 1))
    # train.item_description = ss.fit_transform(train.item_description.reshape(-1, 1))
    # train.item_description_std = ss.fit_transform(train.item_description_std.reshape(-1, 1))
    # train.name_std = ss.fit_transform(train.name_std.reshape(-1, 1))
    return scaled_train, scaled_test
def my_feature_selection(x_train_f, y_train_f, x_test_f):
    """Keep the k best-scoring features (univariate selection) for both
    the train and test matrices.

    BUG FIX: the original used chi2, which is a classification score (and
    requires non-negative features), while the target here (price) is
    continuous -- f_regression is the appropriate univariate test for a
    regression target. Also clamp k so it never exceeds the number of
    available columns (SelectKBest raises otherwise).

    :param x_train_f: 2-D training feature matrix.
    :param y_train_f: continuous training target.
    :param x_test_f: 2-D test feature matrix (same columns as train).
    :return: (x_train, y_train_f, x_test) with the selected columns.
    """
    k = min(80, x_train_f.shape[1])
    ftest = SelectKBest(score_func=f_regression, k=k)
    fit = ftest.fit(x_train_f, y_train_f)
    print(fit.scores_)  # the bigger the score, the better the feature
    x_train = fit.transform(x_train_f)
    x_test = fit.transform(x_test_f)
    # Reference alternatives kept from the original notes:
    # VarianceThreshold(threshold=3).fit_transform(train[test_column_names])
    # model = LinearRegression()
    # rfe = RFE(model, len(test_column_names) - 1)
    # fit = rfe.fit(train[test_column_names], train[label_column_names])
    # print fit.n_features_
    # print fit.support_
    # print fit.ranking_
    # fit.transform(train[test_column_names])
    # pca = PCA(n_components=3, copy=False)
    # fit = pca.fit(train[test_column_names])
    # print fit.explained_variance_ratio_  # bigger = explains more variance
    # print fit.components_
    # print fit.transform(train[test_column_names])
    # model = ExtraTreesRegressor()
    # fit = model.fit(train[test_column_names], train[label_column_names])
    # print fit.feature_importances_
    return x_train, y_train_f, x_test
def __rmsle(y, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Each value v contributes log(|v| + 1); the abs() mirrors the original
    implementation's guard so negative predictions do not make the log
    undefined. Removed a commented-out duplicate and replaced the
    enumerate-then-index loop (whose loop variable was unused) with zip.

    :param y: sequence of true target values.
    :param y_pred: sequence of predictions, same length as y.
    :return: RMSLE as a float.
    :raises AssertionError: if the two sequences differ in length.
    :raises ZeroDivisionError: if the sequences are empty (as before).
    """
    assert len(y) == len(y_pred)
    total = sum(
        (math.log(math.fabs(pred) + 1) - math.log(math.fabs(actual) + 1)) ** 2.0
        for actual, pred in zip(y, y_pred)
    )
    return (total / len(y)) ** 0.5
def my_normal_model_selection(X_train,y_train):
'''
classification:
linear: LogisticRegression,LinearDiscriminantAnalysis
non-linear:KNeighborsClassifier,GaussianNB,DecisionTreeClassifier,SVC
regression:
linear:LinearRegression,Ridge,Lasso,ElasticNet
non-linear:KNeighborsRegressor,DecisionTreeRegressor,SVR
'''
regression_piplies={}
regression_piplies['LR']=Pipeline([('LR',LinearRegression())])
regression_piplies['RIGE']=Pipeline([('RIGE',Ridge())])
regression_piplies['LA']=Pipeline([('LA',Lasso())])
regression_piplies['EN']=Pipeline([('EN',ElasticNet())])
regression_piplies['KN']=Pipeline([('KN',KNeighborsRegressor())])
regression_piplies['DT']=Pipeline([('DT',DecisionTreeRegressor())])
regression_piplies['SVM']=Pipeline([('SVM',SVR())])
results=[]
for key in regression_piplies:
kf = KFold(n_splits=5,random_state=7)
cv_result=cross_val_score(regression_piplies[key],X_train, y_train,cv=kf,scoring=make_scorer(score_func=__rmsle, greater_is_better=True), n_jobs=-1)
results.append(cv_result)
print key, cv_result.mean(),cv_result.std()
if MY_PLOT_SHOW:
fig=plt.figure()
fig.suptitle("Algorithm Comparison")
ax=fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(regression_piplies.keys())
plt.show()
#select the best non-ensemble model to do grid-search
model=None
parameters={'kernel':['linear'],'degree':[1],'epsilon':[0.1],'gamma':['auto'],'C':[1]}
model = SVR()
best_score, best_params = __my_grid_search(X_train,y_train,model,parameters)
print best_score, best_params
# model=None
# parameters={'n_neighbors':[10],'algorithm':['auto'],'leaf_size':[30],'p':[2]}
#
# model = KNeighborsRegressor()
# best_score, best_params = __my_grid_search(X_train,y_train,model,parameters)
# print best_score, best_params
'''
#pipeline ex1
steps=[]
steps.append(('Standarize',StandardScaler()))
steps.append(('lr',LinearRegression()))
model = Pipeline(steps)
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
# pipeline ex2
features=[]
features.append(('pca',PCA()))
features.append(('select_best',SelectKBest(k=6)))
steps=[]
steps.append(('feature_union',FeatureUnion(features)))
steps.append(('lr',LinearRegression()))
model=Pipeline(steps)
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
#both for classification and regression
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model = LinearRegression()
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
#only for classifier
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model = LinearRegression()
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf, scoring='neg_log_loss')
print result.mean(), result.std()#neg_log_loss
# only for classifier
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model = LinearRegression()
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf, scoring='roc_auc')
print result.mean(), result.std()#roc_auc
# only for classifier
test_size=0.33
seed=4
x_train,x_test,y_train,y_test=train_test_split(train[test_column_names], train[label_column_names],test_size=test_size,random_state=seed)
model=LinearRegression()
model.fit(x_train,y_train)
predicted=model.predict(x_test)
matrix = confusion_matrix(y_test,predicted)
print matrix
# only for classifier
test_size=0.33
seed=4
x_train,x_test,y_train,y_test=train_test_split(train[test_column_names], train[label_column_names],test_size=test_size,random_state=seed)
model=LinearRegression()
model.fit(x_train,y_train)
predicted=model.predict(x_test)
report = classification_report(y_test,predicted)
print report
#only for regression
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model = LinearRegression()
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf, scoring='neg_mean_absolute_error')
print result.mean(), result.std()#neg_mean_absolute_error
# only for regression
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model = LinearRegression()
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf, scoring='neg_mean_squared_error')
print result.mean(), result.std()#neg_mean_squared_error
# only for regression
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model = LinearRegression()
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf, scoring='r2')
print result.mean(), result.std()#r2
'''
return model
def my_ensemble_model_selection(X_train,y_train):
ensembles={}
ensembles['BAG']=Pipeline([('BAG',BaggingRegressor())])
ensembles['RF']=Pipeline([('RF',RandomForestRegressor())])
ensembles['ET']=Pipeline([('ET',ExtraTreesRegressor())])
ensembles['ADA']=Pipeline([('ADA',AdaBoostRegressor())])
ensembles['GB']=Pipeline([('GB',GradientBoostingRegressor())])
ensembles['XGB']=Pipeline([('XGB',XGBRegressor())])
#ensembles['GBM']=Pipeline([('GBM',LGBMRegressor())])
results = []
for key in ensembles:
kf = KFold(n_splits=5, random_state=7)
cv_result = cross_val_score(ensembles[key], X_train,y_train, cv=kf,n_jobs=-1, scoring=make_scorer(score_func=__rmsle, greater_is_better=True))
results.append(cv_result)
print key, cv_result.mean(), cv_result.std()
if MY_PLOT_SHOW:
fig = plt.figure()
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(ensembles.keys())
plt.show()
#select the best ensemble model to do grid-search
model=XGBRegressor()
parameters={'n_estimators':[10],'learning_rate':[0.1],'max_depth':[1],'booster':['gbtree'],'min_child_weight':[1],'subsample':[1.0],'random_state':[10]}
best_score, best_params = __my_grid_search(X_train,y_train,model,parameters)
print best_score, best_params
a=1
'''
#Bagging methods:
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
cart=DecisionTreeRegressor()
model = BaggingRegressor(base_estimator=cart,n_estimators=100,random_state=7)
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model=RandomForestRegressor(n_estimators=100,random_state=7,max_features=3)
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model=ExtraTreesRegressor(n_estimators=100,random_state=7,max_features=3)
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
#Boosting methods:
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
cart=DecisionTreeRegressor()
model=AdaBoostRegressor(n_estimators=100,random_state=7,base_estimator=cart)
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model=GradientBoostingRegressor(n_estimators=100,random_state=7)
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
#Voting
models=[]
models.append(('lr',LinearRegression()))
models.append(('svm',SVR()))
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model=VotingClassifier(estimators=models)
result = cross_val_score(model,train[test_column_names], train[label_column_names],cv=kf)
print result.mean(), result.std()#accuracy
'''
return model
def __my_cross_validate(train):
    """Reference snippets for three cross-validation styles: a hold-out
    split, manual KFold iteration, and leave-one-out via cross_val_score.

    Reads the module-level test_column_names / label_column_names.
    """
    test_size = 0.33
    seed = 4
    # Style 1: simple hold-out split scored with model.score (R^2 for regressors).
    x_train, x_test, y_train, y_test = train_test_split(train[test_column_names], train[label_column_names], test_size=test_size, random_state=seed)
    model = LinearRegression()
    model.fit(x_train, y_train)
    result = model.score(x_test, y_test)
    print result
    # Style 2: iterate the folds by hand.
    kf = KFold(n_splits=5, random_state=seed, shuffle=True)
    for x_train_index, x_test_index in kf.split(train[test_column_names], train[label_column_names]):
        # NOTE(review): as_matrix() was removed in pandas 0.25 -- use .values.
        print train.as_matrix()[x_train_index]
    print x_train
    # Style 3: leave-one-out (alternatives: ShuffleSplit, KFold).
    loocv = LeaveOneOut()
    model = LinearRegression()
    result = cross_val_score(model, train[test_column_names], train[label_column_names], cv=loocv)
    print result.mean(), result.std()
def __my_grid_search(x,y,model,parameters):
#apply when parameters are less than 3
kf = KFold(random_state=7, n_splits=5)#ShuffleSplit,KFold
grid = GridSearchCV(estimator=model, param_grid=parameters, cv=kf, n_jobs=-1,scoring=make_scorer(score_func=__rmsle, greater_is_better=False))#n_jobs is important for time saving
grid.fit(x,y)
print grid.get_params()
'''
#apply when parameters are more than 3
kf = KFold(random_state=7,n_splits=5)#ShuffleSplit,KFold
model=GradientBoostingRegressor(n_estimators=100,random_state=7)
param_ids={'n_estimators':100,'max_depth':[1,2,3,4]}
grid = RandomizedSearchCV(estimator=model,param_distributions=param_ids,n_iter=100,random_state=7, cv=kf)
grid.fit(train[test_column_names], train[label_column_names])
print grid.best_score_, grid.best_params_
'''
return grid.best_score_, grid.best_params_
def my_save_model(train):
    """Score a voting model, persist it to ./test.model, reload it, and
    re-score to prove the serialization round trip works.

    NOTE(review): `dump`/`load` resolve to joblib's versions here -- the
    `from sklearn.externals.joblib import *` at the top of the file comes
    after the pickle star import and shadows pickle's dump/load (which take
    file objects, not paths).
    NOTE(review): VotingClassifier is built from regressors (LinearRegression,
    SVR) against a regression target -- confirm VotingRegressor was meant.
    """
    # Voting ensemble from two base estimators.
    models = []
    models.append(('lr', LinearRegression()))
    models.append(('svm', SVR()))
    kf = KFold(random_state=7, n_splits=5)  # alternatives: ShuffleSplit, KFold variants
    model = VotingClassifier(estimators=models)
    result = cross_val_score(model, train[test_column_names], train[label_column_names], cv=kf)
    print result.mean(), result.std()  # accuracy
    dump(model, './test.model')   # joblib: serialize to a path
    my_model = load('./test.model')
    result = cross_val_score(my_model, train[test_column_names], train[label_column_names], cv=kf)
    print result.mean(), result.std()  # accuracy, should match the pre-save run
def my_draw_learning_curve(estimator, X, y, train_sizes=np.linspace(.05, 1., 20)):
    """Plot train vs. cross-validation learning curves using the RMSLE-based
    scorer, and summarize the final gap between the two curves.

    :param estimator: any sklearn-compatible estimator.
    :param train_sizes: fractions of the training set to evaluate at.
    :return: (midpoint, diff) of the last train/CV score bands.

    NOTE(review): the whole body, including the return, appears to sit
    inside the MY_PLOT_SHOW guard, so the function returns None when
    plotting is disabled -- confirm callers tolerate that.
    """
    if MY_PLOT_SHOW:
        train_size, train_score, test_score = learning_curve(estimator, X, y, train_sizes=train_sizes, scoring=make_scorer(__rmsle, greater_is_better=False))
        # Mean and spread of the fold scores at each training size.
        train_score_mean = np.mean(train_score, axis=1)
        train_score_std = np.std(train_score, axis=1)
        test_score_mean = np.mean(test_score, axis=1)
        test_score_std = np.std(test_score, axis=1)
        plt.figure()
        plt.title('Learning Curve')
        plt.xlabel('Number of training set')
        plt.ylabel('Score')
        plt.grid()
        # One-sigma bands around each curve.
        plt.fill_between(train_size, train_score_mean - train_score_std, train_score_mean + train_score_std, alpha=0.1, color='b')
        plt.fill_between(train_size, test_score_mean - test_score_std, test_score_mean + test_score_std, alpha=0.1, color='r')
        plt.plot(train_size, train_score_mean, 'o-', color='b', label='Score in training set')
        plt.plot(train_size, test_score_mean, 'o-', color='r', label='Score in cv set')
        plt.legend(loc='best')
        plt.show()
        # Midpoint of, and gap between, the final train/CV bands.
        midpoint = ((train_score_mean[-1] + train_score_std[-1] + test_score_mean[-1] - test_score_std[-1])) / 2
        diff = (train_score_mean[-1] + train_score_std[-1]) - (test_score_mean[-1] - test_score_std[-1])
        return midpoint, diff
MYROWS = 1000  # rows read from each TSV -- small sample for quick experiments
# Mercari dataset column layouts: train carries the 'price' label, test does not.
train_column_names = ['name', 'item_condition_id', 'category_name', 'brand_name', 'price', 'shipping', 'item_description']
test_column_names = ['name', 'item_condition_id', 'category_name', 'brand_name', 'shipping', 'item_description']
label_column_names = ['price']  # regression target
if __name__ == '__main__':
    # 1. read data (nrows keeps the experiment small).
    oringin_train = pd.read_table('./train.tsv', nrows=MYROWS)
    oringin_test = pd.read_table('./test.tsv', nrows=MYROWS)
    train = oringin_train[train_column_names].copy()
    test = oringin_test[test_column_names].copy()
    # ================================ Feature Engineering Start ================================
    # 2. understand the data -- can be called anywhere, several times.
    my_print_datas(train)
    # 3. visualize the data -- can be called anywhere, several times.
    my_draw_datas(train)
    # 4. feature extraction: fill missing data, one-hot/label-encode, tokenize,
    #    pad sequences -- everything becomes numeric.
    x_train_f, y_train_f, x_test_f = my_feature_extraction(train, test)
    # 5. preprocessing: standardize / scale / normalize (min-max here).
    x_train_f, x_test_f = my_preprocessing_data(x_train_f, x_test_f)
    # 6. feature selection: K best / feature importance.
    X_train, y_train, X_test = my_feature_selection(x_train_f, y_train_f, x_test_f)
    # ================================ Feature Engineering End ==================================
    # 7. normal model selection: pipelines, grid search, cross-validation.
    estimator1 = my_normal_model_selection(X_train, y_train)
    # 8. ensemble model selection: pipelines, grid search, cross-validation.
    estimator2 = my_ensemble_model_selection(X_train, y_train)
    model = estimator1
    # 9. draw learning curve.
    my_draw_learning_curve(model, X_train, y_train)
    # 9. serialize.
    my_save_model(train)
    # ---------------- scratch work below ----------------
    # NOTE(review): this tail cannot run as written -- train.seq_name /
    # seq_item_description are never created (the tokenizer writes back into
    # 'name'/'item_description'), and RATIO and fill_data are undefined.
    # Confirm whether it should be deleted or finished.
    MAX_NAME_SEQ = 10
    MAX_ITEM_DESC_SEQ = 75
    MAX_TEXT = np.max([np.max(train.seq_name.max())
        , np.max(test.seq_name.max())
        , np.max(train.seq_item_description.max())
        , np.max(test.seq_item_description.max())]) + 2
    MAX_CATEGORY = np.max([train.category_name.max(), test.category_name.max()]) + 1
    MAX_BRAND = np.max([train.brand_name.max(), test.brand_name.max()]) + 1
    MAX_CONDITION = np.max([train.item_condition_id.max(), test.item_condition_id.max()]) + 1
    #train["target"] = np.log(train.price + 1)
    #target_scaler = MinMaxScaler(feature_range=(-1, 1))
    #train["target"] = target_scaler.fit_transform(train.target.reshape(-1, 1))
    #pd.DataFrame(train.target).hist()
    print len(train)
    dtrain, dvalid = train_test_split(train, random_state=123, train_size=RATIO)
    x_train = fill_data(dtrain, MAX_NAME_SEQ, MAX_ITEM_DESC_SEQ).filter(items=['name', 'item_desc', 'brand_name', 'category_name', 'item_condition_id', 'shipping'])
    y_train = dtrain.price
    x_valid = fill_data(dvalid, MAX_NAME_SEQ, MAX_ITEM_DESC_SEQ).filter(items=['name', 'item_desc', 'brand_name', 'category_name', 'item_condition_id', 'shipping'])
    y_valid = dvalid.price
    x_test = fill_data(test, MAX_NAME_SEQ, MAX_ITEM_DESC_SEQ).filter(items=['name', 'item_desc', 'brand_name', 'category_name', 'item_condition_id', 'shipping'])
    y_test = np.ones(len(x_test))
    #
    # result = do_predicting(name,x_test.as_matrix(), y_test)
    # ans = test['test_id']
    # ansD = ans.to_frame()
    # other = pandas.DataFrame({'price':result})
    # #print other
    # ansD = ansD.join(other)
    # #print ansD
    # ansD.to_csv('./result.csv', columns=['test_id','price'], index = False)