from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
import numpy as np

def get_iris():
    data = load_iris()
    x = data['data']
    y = data['target']
    # Stack features and label into one array so they are shuffled together
    input_dataset = np.column_stack([x, y])
    np.random.shuffle(input_dataset)
    return input_dataset

# Split the dataset with an 80/20 ratio
data = get_iris()
train, test = train_test_split(data, train_size=0.8)
print("train size", train.shape)
print("test size ", test.shape)
# Check whether the class labels are reasonably distributed between the training and test sets
def get_class_distribution(y):
    distribution = {}
    set_y = set(y)
    for y_label in set_y:
        no_element = len(np.where(y == y_label)[0])
        distribution[y_label] = no_element
    return distribution

def print_class_label_split(train, test):
    y_train = train[:, -1]
    train_distribution = get_class_distribution(y_train)
    print("\nTrain data set class label distribution")
    print("======================================\n")
    for k, v in train_distribution.items():
        # Note: the values printed are record counts, not percentages
        print("class label=%d, Percentage records=%0.2f" % (k, v))
    y_test = test[:, -1]
    test_distribution = get_class_distribution(y_test)
    print("\nTest data set class label distribution")
    print("======================================\n")
    for k, v in test_distribution.items():
        print("class label=%d, Percentage records=%0.2f" % (k, v))

print_class_label_split(train, test)
train size (120, 5)
test size (30, 5)
Train data set class label distribution
======================================
class label=0, Percentage records=36.00
class label=1, Percentage records=42.00
class label=2, Percentage records=42.00
Test data set class label distribution
======================================
class label=0, Percentage records=14.00
class label=1, Percentage records=8.00
class label=2, Percentage records=8.00
How to split the class labels evenly between the training and test sets
from sklearn.cross_validation import StratifiedShuffleSplit

stratified_split = StratifiedShuffleSplit(data[:, -1], test_size=0.2, n_iter=1)
for train_index, test_index in stratified_split:
    train = data[train_index]
    test = data[test_index]
    print_class_label_split(train, test)
Train data set class label distribution
======================================
class label=0, Percentage records=40.00
class label=1, Percentage records=40.00
class label=2, Percentage records=40.00
Test data set class label distribution
======================================
class label=0, Percentage records=10.00
class label=1, Percentage records=10.00
class label=2, Percentage records=10.00
For each review we want to compare P(class=positive|review) and P(class=negative|review). By Bayes' theorem:

P(positive|review) = P(review|positive) * P(positive) / P(review)
P(negative|review) = P(review|negative) * P(negative) / P(review)

Since we only compare these two quantities to decide the final label, we can ignore the denominator P(review): it is merely a common scaling factor. The left-hand side is called the posterior probability. In the numerator of the right-hand side, P(review|positive) * P(positive), the term P(positive) is the prior probability of a positive review: our belief about the distribution of the positive class label, estimated from the training set as

P(positive) = (number of reviews with a positive class label) / (total number of reviews in the corpus)

P(review|positive) is the likelihood; it answers the question: given that the class is positive, how likely is this review?
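As a small illustration of how the prior is estimated from label counts (the toy list of labels here is hypothetical and not part of the recipe below):

# Toy sketch: estimating the class prior P(positive) from a list of labels
labels = ['pos', 'neg', 'pos', 'pos', 'neg']            # hypothetical training labels
p_positive = labels.count('pos') / float(len(labels))   # reviews with positive label / total reviews
print(p_positive)                                        # 0.6 for this toy list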
# Load libraries and prepare the data
from nltk.corpus import movie_reviews, stopwords
from sklearn.cross_validation import StratifiedShuffleSplit
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

def get_data():
    dataset = []
    ylabel = []
    for cat in movie_reviews.categories():
        for field in movie_reviews.fileids(cat):
            words = list(movie_reviews.words(field))
            dataset.append((words, cat))
            ylabel.append(cat)
    return dataset, ylabel

def get_train_test(input_dataset, ylabel):
    stratified_split = StratifiedShuffleSplit(ylabel, train_size=0.8, n_iter=1)
    for train_index, test_index in stratified_split:
        train = [input_dataset[i] for i in train_index]
        train_y = [ylabel[i] for i in train_index]
        test = [input_dataset[i] for i in test_index]
        test_y = [ylabel[i] for i in test_index]
    return train, train_y, test, test_y
# Model building
def build_word_features(instance):
    # Bag-of-words features: every word in the review becomes a feature
    feature_set = {}
    words = instance[0]
    for word in words:
        feature_set[word] = 1
    return (feature_set, instance[1])

def build_negate_features(instance):
    # Bag-of-words, but a word following 'no'/'not' is prefixed with 'not_'
    words = instance[0]
    final_words = []
    negate = False
    negate_words = ['no', 'not']
    for word in words:
        if negate:
            word = 'not_' + word
            negate = False
        if word not in negate_words:
            final_words.append(word)
        else:
            negate = True
    feature_set = {}
    for word in final_words:
        feature_set[word] = 1
    return (feature_set, instance[1])

def remove_stop_words(in_data):
    stopwordlist = stopwords.words('english')
    negate_words = ['no', 'not']
    # Keep 'no' and 'not' so that negation handling still works
    new_stopwords = [word for word in stopwordlist if word not in negate_words]
    label = in_data[1]
    words = [word for word in in_data[0] if word not in new_stopwords]
    return (words, label)

def build_keyphrase_features(instance):
    # Use the 400 most frequent bigrams (after stop-word removal) as features
    feature_set = {}
    instance = remove_stop_words(instance)
    words = instance[0]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 400)
    for bigram in bigrams:
        feature_set[bigram] = 1
    return (feature_set, instance[1])

def build_model(features):
    model = nltk.NaiveBayesClassifier.train(features)
    return model

def probe_model(model, features, dataset_type='Train'):
    accuracy = nltk.classify.accuracy(model, features)
    print("\n" + dataset_type + " Accuracy = %0.2f" % (accuracy * 100) + "%")

def show_features(model, features=5):
    print("\nFeature Importance")
    print("=====================")
    print(model.show_most_informative_features(features))
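To see what the negation handling actually produces, here is a quick probe of build_negate_features on a made-up instance (not part of the original recipe):

features, label = build_negate_features((['the', 'movie', 'was', 'not', 'good'], 'neg'))
print(features)   # {'the': 1, 'movie': 1, 'was': 1, 'not_good': 1}
print(label)      # 'neg'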
# Model tuning: each cycle uses a different feature builder
def build_model_cycle_1(train_data, dev_data):
    # Cycle 1: plain bag-of-words features
    train_features = list(map(build_word_features, train_data))
    dev_features = list(map(build_word_features, dev_data))
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, "Dev")
    return model

def build_model_cycle_2(train_data, dev_data):
    # Cycle 2: bag-of-words with negation handling
    train_features = list(map(build_negate_features, train_data))
    dev_features = list(map(build_negate_features, dev_data))
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, "Dev")
    return model

def build_model_cycle_3(train_data, dev_data):
    # Cycle 3: key-phrase (bigram) features
    train_features = list(map(build_keyphrase_features, train_data))
    dev_features = list(map(build_keyphrase_features, dev_data))
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, "Dev")
    return model
# Main program
input_dataset, ylabels = get_data()
train_data, train_y, all_test_data, all_test_y = get_train_test(input_dataset, ylabels)
dev_data, dev_y, test_data, test_y = get_train_test(all_test_data, all_test_y)
print("\n Original Data Size =", len(input_dataset))
print("\n Train Data Size =", len(train_data))
print("\n Dev Data Size =", len(dev_data))
print("\n Test Data Size =", len(test_data))
model_cycle_1 = build_model_cycle_1(train_data, dev_data)
show_features(model_cycle_1)
model_cycle_2 = build_model_cycle_2(train_data, dev_data)
show_features(model_cycle_2)
model_cycle_3 = build_model_cycle_3(train_data, dev_data)
show_features(model_cycle_3)
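The notebook stops at the dev-set comparison; a natural final step, not shown in the original, would be to probe the winning cycle on the untouched test split, for example (assuming cycle 2 wins):

# Hypothetical final evaluation on the held-out test data
test_features = list(map(build_negate_features, test_data))
probe_model(model_cycle_2, test_features, "Test")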
E:\PycharmProjects\JupyterFiles\python>dot -Tpdf tree.dot -o tree.pdf
The Lasso estimator minimizes the residual sum of squares plus an L1 penalty on the coefficients:

∑_{i=1}^{n} (y_i − w_0 − ∑_{j=1}^{m} x_{ij} w_j)² + α ∑_{j=1}^{m} |w_j|
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt

def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y

def build_model(x, y):
    # Fit a Lasso model over a range of alpha values and trace the coefficient paths
    # (alpha=0 is included, which triggers the convergence warnings shown below)
    alpha_range = np.linspace(0, 0.5, 200)
    model = Lasso(normalize=True)
    coefficients = []
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(x, y)
        coefficients.append(model.coef_)
    coeff_path(alpha_range, coefficients)

def view_model(model):
    print("\n Model Coefficients")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\tCoefficient %d %0.3f" % (i + 1, coef))
    print("\n\t Intercept %0.3f" % (model.intercept_))

def model_worth(true_y, predicted_y):
    print("\tMean squared error = %0.2f" % (mean_squared_error(true_y, predicted_y)))
    return mean_squared_error(true_y, predicted_y)

def coeff_path(alpha_range, coefficients):
    plt.figure(figsize=(16, 9))
    plt.xlabel("alpha value")
    plt.ylabel("coefficient weight")
    plt.plot(alpha_range, coefficients)
    plt.axis('tight')

def get_coeff(x, y, alpha):
    # Return the indices of the attributes whose Lasso coefficient is non-zero at this alpha
    model = Lasso(normalize=True, alpha=alpha)
    model.fit(x, y)
    coefs = model.coef_
    indices = [i for i, coef in enumerate(coefs) if abs(coef) > 0.0]
    return indices
x, y = get_data()
build_model(x, y)
print("\nPredicting using all the variables")
full_model = LinearRegression(normalize=True)
full_model.fit(x, y)
predicted_y = full_model.predict(x)
model_worth(y, predicted_y)
print("\nModels at different alpha values")
alpha_values = [0.22, 0.08, 0.01]
for alpha in alpha_values:
    indices = get_coeff(x, y, alpha)
    print("\t Attributes include", indices)
    # Refit a plain linear regression using only the attributes Lasso kept
    x_new = x[:, indices]
    model = LinearRegression(normalize=True)
    model.fit(x_new, y)
    predicted_y = model.predict(x_new)
    model_worth(y, predicted_y)
C:\Users\Administrator\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: UserWarning: With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator
C:\Users\Administrator\Anaconda3\lib\site-packages\sklearn\linear_model\coordinate_descent.py:477: UserWarning: Coordinate descent with no regularization may lead to unexpected results and is discouraged.
positive)
C:\Users\Administrator\Anaconda3\lib\site-packages\sklearn\linear_model\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
ConvergenceWarning)
Predicting using all the variables
Mean squared error = 21.90
Models at different alpha values
Attributes include [5, 12]
Mean squared error = 30.51
Attributes include [5, 10, 12]
Mean squared error = 27.13
Attributes include [0, 1, 3, 4, 5, 7, 10, 11, 12]
Mean squared error = 22.89
from sklearn.datasets import load_iris
from sklearn.cross_validation import KFold, StratifiedKFold

def get_data():
    data = load_iris()
    x = data['data']
    y = data['target']
    return x, y

def class_distribution(y):
    # Print the fraction of each class label in y
    class_dist = {}
    total = 0
    for entry in y:
        try:
            class_dist[entry] += 1
        except KeyError:
            class_dist[entry] = 1
        total += 1
    for k, v in class_dist.items():
        print('\tclass %d percentage = %0.2f' % (k, v / (1.0 * total)))

# First approach: plain k-fold
x, y = get_data()
kfolds = KFold(n=y.shape[0], n_folds=3)
fold_count = 1
for train, test in kfolds:
    print("Fold %d x train shape" % (fold_count), x[train].shape, 'x test shape', x[test].shape)
    y_train = y[train]
    y_test = y[test]
    print("Train Class Distribution")
    class_distribution(y_train)
    print("Test Class Distribution")
    class_distribution(y_test)
    fold_count += 1

# Second approach: stratified k-fold
skfolds = StratifiedKFold(y, n_folds=3)
fold_count = 1
for train, test in skfolds:
    print("Fold %d x train shape" % (fold_count), x[train].shape, 'x test shape', x[test].shape)
    y_train = y[train]
    y_test = y[test]
    print("Train Class Distribution")
    class_distribution(y_train)
    print("Test Class Distribution")
    class_distribution(y_test)
    fold_count += 1
Fold 1 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
class 1 percentage = 0.50
class 2 percentage = 0.50
Test Class Distribution
class 0 percentage = 1.00
Fold 2 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
class 0 percentage = 0.50
class 2 percentage = 0.50
Test Class Distribution
class 1 percentage = 1.00
Fold 3 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
class 0 percentage = 0.50
class 1 percentage = 0.50
Test Class Distribution
class 2 percentage = 1.00
Fold 1 x train shape (99, 4) x test shape (51, 4)
Train Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Test Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Fold 2 x train shape (99, 4) x test shape (51, 4)
Train Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Test Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Fold 3 x train shape (102, 4) x test shape (48, 4)
Train Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Test Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold, train_test_split
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y

def build_model(x, y):
    # Search for the best Ridge alpha with 5-fold cross-validation
    kfold = KFold(y.shape[0], 5)
    model = Ridge(normalize=True)
    alpha_range = np.linspace(0.0015, 0.0017, 30)
    grid_param = {'alpha': alpha_range}
    grid = GridSearchCV(estimator=model, param_grid=grid_param, cv=kfold, scoring='mean_squared_error')
    grid.fit(x, y)
    display_param_results(grid.grid_scores_)
    print(grid.best_params_)
    return grid.best_estimator_

def view_model(model):
    print("\n Model Coefficients")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\tCoefficient %d %0.3f" % (i + 1, coef))
    print("\n\t Intercept %0.3f" % (model.intercept_))

def model_worth(true_y, predicted_y):
    print("\tMean squared error = %0.2f" % (mean_squared_error(true_y, predicted_y)))
    return mean_squared_error(true_y, predicted_y)

def display_param_results(param_results):
    fold = 1
    for param_result in param_results:
        print("Fold %d mean squared error %0.2f" % (fold, abs(param_result[1])), param_result[0])
        fold += 1
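The driver code for this recipe is missing from the notebook; a minimal sketch, assuming the functions above and following the pattern of the earlier recipes, would be:

# Hypothetical driver: search for the best alpha, inspect the model, and measure training error
x, y = get_data()
best_model = build_model(x, y)
view_model(best_model)
predicted_y = best_model.predict(x)
model_worth(y, predicted_y)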
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

def get_data():
    x, y = make_classification(n_samples=500, n_features=30, flip_y=0.03, n_informative=18,
                               n_redundant=3, n_repeated=3, random_state=7)
    return x, y

def build_single_model(x, y):
    model = KNeighborsClassifier()
    model.fit(x, y)
    return model

def bagging_model(x, y):
    # 100 KNN estimators, each trained on a bootstrap sample with 70% of the features
    bagging = BaggingClassifier(KNeighborsClassifier(), n_estimators=100, random_state=9,
                                max_samples=1.0, max_features=0.7,
                                bootstrap=True, bootstrap_features=True)
    bagging.fit(x, y)
    return bagging

def view_model(model):
    print("\n Sampled attributes in top 10 estimators \n")
    for i, features in enumerate(model.estimators_features_[0:10]):
        print("estimator %d" % (i + 1), features)

x, y = get_data()
x_train, x_test_all, y_train, y_test_all = train_test_split(x, y, test_size=0.3, random_state=9)
x_dev, x_test, y_dev, y_test = train_test_split(x_test_all, y_test_all, test_size=0.3, random_state=9)

# Build a single model
model = build_single_model(x_train, y_train)
predicted_y = model.predict(x_train)
print("\n Single Model Accuracy on training data\n")
print(classification_report(y_train, predicted_y))

# Build an ensemble of models (bagging)
bagging = bagging_model(x_train, y_train)
predicted_y = bagging.predict(x_train)
print("\n Bagging Model Accuracy on training data\n")
print(classification_report(y_train, predicted_y))
view_model(bagging)

# Check performance on the dev set
predicted_y = model.predict(x_dev)
print("\n Single Model Accuracy on Dev data\n")
print(classification_report(y_dev, predicted_y))
predicted_y = bagging.predict(x_dev)
print("\n Bagging Model Accuracy on Dev data\n")
print(classification_report(y_dev, predicted_y))
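The held-out test split created above is never used in the original recipe; a sketch of the final comparison one would typically add:

# Hypothetical final step: compare both models on the untouched test split
predicted_y = model.predict(x_test)
print("\n Single Model Accuracy on Test data\n")
print(classification_report(y_test, predicted_y))
predicted_y = bagging.predict(x_test)
print("\n Bagging Model Accuracy on Test data\n")
print(classification_report(y_test, predicted_y))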
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, zero_one_loss
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import itertools

def get_data():
    x, y = make_classification(n_samples=500, n_features=30, flip_y=0.03, n_informative=18,
                               n_redundant=3, n_repeated=3, random_state=7)
    return x, y

def build_single_model(x, y):
    model = DecisionTreeClassifier()
    model.fit(x, y)
    return model

def build_boosting_model(x, y, n_estimators=20):
    # AdaBoost over decision stumps (depth-1 trees)
    boosting = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, min_samples_leaf=1),
                                  random_state=9, n_estimators=n_estimators, algorithm="SAMME")
    boosting.fit(x, y)
    return boosting

def view_model(model):
    print("\n Estimator Weights and Error\n")
    for i, weight in enumerate(model.estimator_weights_):
        print("estimator %d weight =%0.4f error= %0.4f" % (i + 1, weight, model.estimator_errors_[i]))
    plt.plot(model.estimator_weights_, model.estimator_errors_)

def number_estimators_vs_err_rate(x, y, x_dev, y_dev):
    # Track the misclassification rate on train and dev as the number of estimators grows
    no_estimators = range(20, 120, 10)
    misclassy_rate = []
    misclassy_rate_dev = []
    for no_estimator in no_estimators:
        boosting = build_boosting_model(x, y, no_estimator)
        predicted_y = boosting.predict(x)
        predicted_y_dev = boosting.predict(x_dev)
        misclassy_rate.append(zero_one_loss(y, predicted_y))
        misclassy_rate_dev.append(zero_one_loss(y_dev, predicted_y_dev))
    # no_estimators = np.asarray(no_estimators)
    # misclassy_rate = np.asarray(misclassy_rate)
    # misclassy_rate_dev = np.asarray(misclassy_rate_dev)
    # print(no_estimators, misclassy_rate)
    plt.plot(no_estimators, misclassy_rate, label='Train', color='g')
    plt.plot(no_estimators, misclassy_rate_dev, label='Dev', color='r')
x, y = get_data()
x_train, x_test_all, y_train, y_test_all = train_test_split(x, y, test_size=0.3, random_state=9)
x_dev, x_test, y_dev, y_test = train_test_split(x_test_all, y_test_all, test_size=0.3, random_state=9)

# Build a single model
model = build_single_model(x_train, y_train)
predicted_y = model.predict(x_train)
print("\n Single Model Accuracy on training data\n")
print(classification_report(y_train, predicted_y))
print("Fraction of misclassification = %0.2f" % (zero_one_loss(y_train, predicted_y) * 100,), '%')

# Build an ensemble of models (boosting)
boosting = build_boosting_model(x_train, y_train)
predicted_y = boosting.predict(x_train)
print("\n Boosting Model Accuracy on training data\n")
print(classification_report(y_train, predicted_y))
print("Fraction of misclassification = %0.2f" % (zero_one_loss(y_train, predicted_y) * 100), "%")
view_model(boosting)

# Check performance on the dev set
predicted_y = model.predict(x_dev)
print("\n Single Model Accuracy on Dev data\n")
print(classification_report(y_dev, predicted_y))
print("Fraction of misclassification = %0.2f" % (zero_one_loss(y_dev, predicted_y) * 100), "%")
predicted_y = boosting.predict(x_dev)
print("\n Boosting Model Accuracy on Dev data\n")
print(classification_report(y_dev, predicted_y))
print("Fraction of misclassification = %0.2f" % (zero_one_loss(y_dev, predicted_y) * 100), "%")
number_estimators_vs_err_rate(x_train, y_train, x_dev, y_dev)
Single Model Accuracy on training data
precision recall f1-score support
0 1.00 1.00 1.00 181
1 1.00 1.00 1.00 169
avg / total 1.00 1.00 1.00 350
Fraction of misclassification = 0.00 %
Boosting Model Accuracy on training data
precision recall f1-score support
0 0.86 0.94 0.90 181
1 0.93 0.84 0.88 169
avg / total 0.89 0.89 0.89 350
Fraction of misclassification = 10.86 %
Estimator Weights and Error
estimator 1 weight =0.8337 error= 0.3029
estimator 2 weight =0.8921 error= 0.2907
estimator 3 weight =0.6730 error= 0.3378
estimator 4 weight =0.6067 error= 0.3528
estimator 5 weight =0.5746 error= 0.3602
estimator 6 weight =0.5537 error= 0.3650
estimator 7 weight =0.5697 error= 0.3613
estimator 8 weight =0.5538 error= 0.3650
estimator 9 weight =0.5579 error= 0.3640
estimator 10 weight =0.4530 error= 0.3886
estimator 11 weight =0.4530 error= 0.3886
estimator 12 weight =0.3564 error= 0.4118
estimator 13 weight =0.4130 error= 0.3982
estimator 14 weight =0.3679 error= 0.4091
estimator 15 weight =0.3142 error= 0.4221
estimator 16 weight =0.3888 error= 0.4040
estimator 17 weight =0.4902 error= 0.3799
estimator 18 weight =0.2798 error= 0.4305
estimator 19 weight =0.4463 error= 0.3902
estimator 20 weight =0.2645 error= 0.4343
Single Model Accuracy on Dev data
precision recall f1-score support
0 0.62 0.75 0.68 51
1 0.70 0.57 0.63 54
avg / total 0.66 0.66 0.65 105
Fraction of misclassification = 34.29 %
Boosting Model Accuracy on Dev data
precision recall f1-score support
0 0.71 0.86 0.78 51
1 0.84 0.67 0.74 54
avg / total 0.78 0.76 0.76 105
Fraction of misclassification = 23.81 %