关键词:机器学习,python,scikit-learn,逻辑回归,Latex
摘要:本文主要介绍了分类任务的分类,逻辑回归的概念,以及分类评估方法;同时介绍了如何使用python对分类任务进行建模、预测结果以及结果评价,以及使用GridSearch方法进行优化;
注:LaTeX常见命令请参考wikibooks,笔者将其pdf上传到csdn下载中。
print 输出文字+数字的三种方法:逗号 `,`、`%d` 格式化 和 `{}`(str.format)
# Three equivalent ways to print text plus a number.
# (The expected outputs are comments now; as bare lines like `spam:747`
# they silently parse as variable annotations instead of output.)
print('spam:',df[df[0]=='spam'][0].count())
#spam: 747
print('spam:%d' %(df[df[0]=='spam'][0].count()))
# spam:747
print('spam:{}' .format(df[df[0]=='spam'][0].count()))
# spam:747
# Convert the string labels ('spam'/'ham') into 0/1 indicator lists;
# spam maps to 1, everything else to 0.  The original for-loops had lost
# their indentation (IndentationError); comprehensions are also clearer.
y_test_binary = [1 if val == 'spam' else 0 for val in y_test]
predictions_binary = [1 if val == 'spam' else 0 for val in predictions]
Matplotlib
库confusion_matrix
绘制成matshow()
,添加colorbar()
# Draw a confusion matrix as a heat map with a colour bar.
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# NOTE(review): `cm` is not defined in this snippet — presumably
# cm = confusion_matrix(y_true, y_pred) was computed earlier; confirm.
plt.matshow(cm)
plt.colorbar()
# Diagonal reference line (chance level on a ROC-style plot).
import matplotlib.pyplot as plt
plt.plot([0,1],[0,1],label='AUC')
plt.legend(loc='lower right')
Numpy
库arange
和linspace
,前者是从起点按照给定步长进行划分,只有当终点也在步长整数倍时才会被包含在内;后者是将起点和终点中间等距划分,终点为最后一位数;X=np.arange(-6,6,5);X
# output: array([-6, -1, 4])
X=np.arange(-6,6,1);X
# arange steps from start by the given step and EXCLUDES the stop value (6).
# output: array([-6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5])
X=np.linspace(-6,6,5);X
# linspace divides [start, stop] into n evenly spaced points, INCLUDING both endpoints.
# output: array([-6., -3., 0., 3., 6.])
Pandas
库==
统计某列含某个值的数量print('含ham短信数量:', df[df[0] == 'ham'][0].count())
skilearn
库train_size
改变训练集和测试集的比例
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.5)
TfidfVectorizer
计算TF-IDF权重
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: learn the vocabulary and IDF weights from the
# training texts only, then apply the same fitted transform to the test texts.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train_raw)
X_test = tfidf.transform(X_test_raw)
LogisticRegression
分类器进行训练和分类
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier and predict labels for the test set.
clf = LogisticRegression()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# Load the tab-separated SMS corpus (column 0 = label, column 1 = message)
# and count messages of each class.
import pandas as pd
df = pd.read_csv('Desktop/data/SMSSpamCollection', delimiter='\t', header=None)
print('count of spam message=\t', df[df[0] == 'spam'][0].count())
print('count of ham message=\t', df[df[0] == 'ham'][0].count())
train_test_split
分成训练集(75%)和测试集(25%)
from sklearn.model_selection import train_test_split
X_train_raw,X_test_raw,y_train,y_test=train_test_split(df[1],df[0])
TfidfVectorizer
实例来计算输入信息的TF-IDF权重;
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features: fit on training texts only, then reuse the
# fitted vectorizer on the test texts (no test-set leakage).
vectorizer=TfidfVectorizer()
X_train=vectorizer.fit_transform(X_train_raw)
X_test=vectorizer.transform(X_test_raw)
LogisticRegression
分类器,来训练fit()
和预测predict()
模型;
from sklearn.linear_model import LogisticRegression
# Train the classifier and inspect the last ten test predictions
# side by side with their source messages.
classifier = LogisticRegression()  # fixed variable-name typo ('classifer')
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
for i, prediction in enumerate(predictions[-10:]):
    # iloc[i - 10] pairs each of the LAST 10 predictions with its own
    # message; the original iloc[i] printed the FIRST 10 messages instead.
    print('predict-type=\t %s. message=\t %s.' % (prediction, X_test_raw.iloc[i - 10]))
y_test
和predictions
转化为0、1组成的list;
y_test_binary=[0]*y_test.shape[0]
# Map string labels to 0/1 indicators (spam -> 1).  The original
# for-loop bodies had lost their indentation (IndentationError).
y_test_binary = [1 if val == 'spam' else 0 for val in y_test]
predictions_binary = [1 if val == 'spam' else 0 for val in predictions]

# Show the last ten items of each, original and binarized, for a sanity check.
print(y_test[-10:])
print(predictions[-10:])
print(y_test_binary[-10:])
print(predictions_binary[-10:])
from sklearn.metrics import confusion_matrix
%matplotlib inline
import matplotlib.pyplot as plt
cf_mtx=confusion_matrix(y_test_binary,predictions_binary)
print(cf_mtx)
plt.matshow(cf_mtx);\
plt.title('confusion matrix(spam=1)');\
plt.colorbar();\
plt.ylabel('actual type');\
plt.xlabel('predict type');\
plt.show()
accuracy_score
,精确率precision_score
,召回率recall_score
,综合评价指标f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Overall scores on the binarized test labels vs. binarized predictions.
print('accuracy=\t', accuracy_score(y_test_binary, predictions_binary))
print('precision_score=\t', precision_score(y_test_binary, predictions_binary))
print('recall_score=\t', recall_score(y_test_binary, predictions_binary))
print('f1_score=\t', f1_score(y_test_binary, predictions_binary))
'''
accuracy= 0.969849246231
precision_score= 0.984375
recall_score= 0.759036144578
f1_score= 0.857142857143
'''
accuracy_score
,精确率precision_score
,召回率recall_score
,综合评价指标f1_score
# restart kernel: Ctrl+.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# 'sklearn.linear_model.logistic' and 'sklearn.cross_validation' were
# removed in scikit-learn 0.20+; the public paths are used instead.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv('Desktop/data/sms.csv')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'], df['label'])

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# 5-fold cross-validated scores on the training set.  Plural variable
# names avoid shadowing the sklearn.metrics functions of the same name.
# NOTE(review): with string labels ('spam'/'ham'), newer scikit-learn
# needs make_scorer(..., pos_label='spam') for precision/recall/f1 — verify.
accuracy_scores = cross_val_score(classifier, X_train, y_train, cv=5)
precision_scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision')
recall_scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
f1_scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
print('accuracy=\t', np.mean(accuracy_scores), accuracy_scores)
print('precisions=\t', np.mean(precision_scores), precision_scores)
print('recall=\t', np.mean(recall_scores), recall_scores)
print('f1=\t', np.mean(f1_scores), f1_scores)
'''
accuracy= 0.955502340314 [ 0.96535245 0.94384707 0.95933014 0.9497006 0.95928144]
precisions= 0.989871815161 [ 0.97752809 0.97183099 1. 1. 1. ]
recall= 0.67899394504 [ 0.76315789 0.60526316 0.69911504 0.62831858 0.69911504]
f1= 0.804132253371 [ 0.85714286 0.74594595 0.82291667 0.77173913 0.82291667]
'''
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve,auc
%matplotlib inline
predictions=classifier.predict_proba(X_test)
false_positive_rate,recall,thresholds=roc_curve(y_test,predictions[:,1])
roc_auc=auc(false_positive_rate,recall)
plt.title('Reciver Operating Characteristic');\
plt.plot(false_positive_rate,recall,'r',label='AUC=%0.2f' %roc_auc);\
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'k--');\
plt.xlim([0.0,1.0]);\
plt.ylim([0.0,1.0]);\
plt.ylabel('Recall');\
plt.xlabel('Fall-out');\
plt.show()
# RESTART KERNEL
import pandas as pd
# 'sklearn.cross_validation' was removed in scikit-learn 0.20+.
from sklearn.model_selection import train_test_split

# Reload the SMS data and make a fresh default (75/25) split.
df = pd.read_csv('Desktop/data/sms.csv')
X, y = df['message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)
GridSearchCV
参数,求最优参数
from sklearn.feature_extraction.text import TfidfVectorizer
# 'sklearn.linear_model.logistic' and 'sklearn.grid_search' were removed
# in scikit-learn 0.20+; use the public module paths.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# TF-IDF vectorizer feeding a logistic-regression classifier.
pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')), ('clf', LogisticRegression())])
# Grid over vectorizer and classifier hyper-parameters (1536 combinations).
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)
'''output:
Fitting 3 folds for each of 1536 candidates, totalling 4608 fits
[Parallel(n_jobs=-1)]: Done 52 tasks | elapsed: 1.7s
[Parallel(n_jobs=-1)]: Done 352 tasks | elapsed: 11.2s
[Parallel(n_jobs=-1)]: Done 852 tasks | elapsed: 32.1s
[Parallel(n_jobs=-1)]: Done 1552 tasks | elapsed: 1.0min
[Parallel(n_jobs=-1)]: Done 2156 tasks | elapsed: 1.5min
[Parallel(n_jobs=-1)]: Done 2706 tasks | elapsed: 1.9min
[Parallel(n_jobs=-1)]: Done 3356 tasks | elapsed: 2.4min
[Parallel(n_jobs=-1)]: Done 4106 tasks | elapsed: 4.1min
[Parallel(n_jobs=-1)]: Done 4608 out of 4608 | elapsed: 4.6min finished
'''
# Report the best cross-validated score and the winning parameter values.
print('best-accuracy=\t', grid_search.best_score_)
print('best paras combination')
best_paras = grid_search.best_estimator_.get_params()
for para_name in sorted(parameters.keys()):  # loop body had lost its indentation
    print('\t%s=\t%r' % (para_name, best_paras[para_name]))

# These metric functions were never imported after the kernel restart.
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Evaluate the refit best estimator on the held-out test set;
# pos_label='spam' is needed because the labels are strings, not 0/1.
predictions = grid_search.predict(X_test)
print('accuracy=', accuracy_score(y_test, predictions))
print('precision=', precision_score(y_test, predictions, pos_label='spam'))
print('recall=', recall_score(y_test, predictions, pos_label='spam'))
'''
best-accuracy= 0.984210526316
best paras combination
clf__C= 10
clf__penalty= 'l2'
vect__max_df= 0.25
vect__max_features= None
vect__ngram_range= (1, 2)
vect__norm= 'l2'
vect__stop_words= None
vect__use_idf= True
accuracy= 0.982065997131
precision= 0.977272727273
recall= 0.891191709845
'''
import pandas as pd
# 'sklearn.cross_validation' was removed in scikit-learn 0.20+.
from sklearn.model_selection import train_test_split

# Load the Rotten Tomatoes phrase/sentiment training data (tab-separated).
df = pd.read_csv('Desktop/Sentiment Analysis on Movie Reviews/train.tsv', header=0, delimiter='\t')
# Quick notebook-style exploration (bare expressions display in Jupyter).
df.head()
df.count()
df.Phrase.head()
df.Sentiment.describe()
df.Sentiment.value_counts()
df.Sentiment.value_counts()/df.Sentiment.count()

# Series.as_matrix() was removed from pandas; to_numpy() is the replacement.
X, y = df['Phrase'], df['Sentiment'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
from sklearn.feature_extraction.text import TfidfVectorizer
# 'sklearn.linear_model.logistic' and 'sklearn.grid_search' were removed
# in scikit-learn 0.20+; use the public module paths.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Smaller grid than the SMS example: 2*2*2*3 = 24 candidates.
pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')), ('clf', LogisticRegression())])
paras = {
    'vect__max_df': (0.25, 0.5),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}
grid_search = GridSearchCV(pipeline, paras, n_jobs=3, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print('best accuracy:=', grid_search.best_score_)
best_paras = grid_search.best_estimator_.get_params()
print('best paras:')
for para_name in sorted(paras.keys()):  # loop body had lost its indentation
    print('\t%s: %r' % (para_name, best_paras[para_name]))
'''
best accuracy:= 0.619735998975
best paras:
clf__C: 10
vect__max_df: 0.25
vect__ngram_range: (1, 2)
vect__use_idf: False
'''
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Evaluate the tuned pipeline on the held-out half of the data:
# overall accuracy, the 5x5 confusion matrix, and a per-class report.
predictions = grid_search.predict(X_test)
print('accuracy=', accuracy_score(y_test, predictions))
print('confusion matrix=', confusion_matrix(y_test, predictions))
print('report=', classification_report(y_test, predictions))
'''
accuracy= 0.635101883891
confusion matrix= [[ 1165 1680 682 67 10]
[ 894 5990 6175 557 39]
[ 199 3219 32596 3611 162]
[ 25 424 6562 8117 1248]
[ 2 33 502 2382 1689]]
report= precision recall f1-score support
0 0.51 0.32 0.40 3604
1 0.53 0.44 0.48 13655
2 0.70 0.82 0.76 39787
3 0.55 0.50 0.52 16376
4 0.54 0.37 0.44 4608
avg / total 0.62 0.64 0.62 78030
'''