机器学习sklearn的模型上线部署工作总结

一、机器学习模型上线两种方式

机器学习模型上线主要分为两种方式。第一种是基于HTTP服务的方式:这种方式需要在生产环境上部署相应的Python环境以及相应的Python机器学习包,好处在于数据预处理部分和模型部分(保存成pkl文件)能够一起部署在HTTP服务中,缺点则是生产环境必须部署相应的Python环境。第二种是通过PMML方式将机器学习模型打包给Java环境使用:这种方法的好处在于能使生产环境脱离Python环境,只需要有Java环境即可,但最大的缺点是不能将数据预处理部分中自己编写的数据处理函数打包进PMML文件,因为PMML文件是用sklearn2pmml导出的,而sklearn2pmml只认识sklearn家族中的函数和模型算法。

二、HTTP服务上线脚本

#
#!coding=UTF-8    
from http.server import HTTPServer,BaseHTTPRequestHandler    
import io,shutil,urllib    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def companyinformationinput(companyname,entType,termDuration,startDuration,numberEmployers,numberEmployees,numberBranches,ratioProvident):
    """Predict a company's rating letter with the persisted random-forest model.

    Builds a one-row feature frame from the arguments, encodes the company
    type through the dictionary stored in ``word2index.txt``, loads the
    pickled classifier and maps its numeric prediction to a letter.

    Args:
        companyname: Company name; kept in the frame but not fed to the model.
        entType: Company type; encoded via ``word2index``, unknown types
            are encoded as 0.
        termDuration: Converted with ``int()``; fills the '营业时长' column.
        startDuration: Converted with ``int()``; fills the '成立时长' column.
        numberEmployers: Converted with ``int()``; fills the
            '所有股东人数(employees)' column.
        numberEmployees: Converted with ``int()``; fills the
            '主要人员数(employees)' column.
        numberBranches: Converted with ``int()``; fills the
            '分支机构数(branches)' column.
        ratioProvident: Converted with ``float()``; fills the '单位比例' column.

    Returns:
        'A', 'B', 'C' or 'D' for model outputs 1-4 (any other output maps to
        NaN through ``Series.map``), or the string 'NULL' when any step
        (conversion, file access, model loading, prediction) raises.
    """
    try:
        predData = pd.DataFrame({
            '企业名称(entName)': [companyname],
            '企业类型(entType)': [entType],
            '营业时长': [int(termDuration)],
            '成立时长': [int(startDuration)],
            '所有股东人数(employees)': [int(numberEmployers)],
            '主要人员数(employees)': [int(numberEmployees)],
            '分支机构数(branches)': [int(numberBranches)],
            '单位比例': [float(ratioProvident)],
        })

        # Load the custom type -> integer-code dictionary. The context manager
        # closes the file even if reading or parsing fails.
        # NOTE(review): eval() executes whatever the file contains; this is
        # only acceptable while word2index.txt is a trusted, locally produced
        # file. Consider ast.literal_eval or JSON if that cannot be guaranteed.
        with open('word2index.txt','r') as f:
            word2index = eval(f.read())

        # Encode the company type; a type never seen before is encoded as 0.
        predData['企业类型(entType)'] = [
            word2index.get(ent_type, 0) for ent_type in predData['企业类型(entType)']
        ]

        # Keep only the model features, in the order listed here.
        predData1 = pd.DataFrame(predData, columns=['企业类型(entType)', '营业时长', '成立时长', '所有股东人数(employees)','主要人员数(employees)', '分支机构数(branches)', '单位比例'])

        # Load the persisted model. Newer scikit-learn releases no longer ship
        # sklearn.externals.joblib, so fall back to the standalone package.
        try:
            from sklearn.externals import joblib
        except ImportError:
            import joblib
        classifier_RandomForest = joblib.load('classifier_RandomForestModelwordsvectornotin.pkl')

        # Translate the numeric class into its letter.
        dictionnary = {1:'A',2:'B',3:'C',4:'D'}
        return pd.Series(classifier_RandomForest.predict(predData1)).map(dictionnary)[0]
    except Exception:
        # Deliberate best-effort: callers receive 'NULL' instead of an error.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        return 'NULL'


#def randomforest(a,b):
#    #模型加载
#    X_test=np.arra

三、PMML文件打包上线脚本

from sklearn_pandas import DataFrameMapper, cross_val_score
import sklearn.decomposition 
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd 
import numpy as np 
import xgboost as xgb 
import random 
import os 
from datetime import date,datetime 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.metrics import roc_curve,roc_auc_score 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler as SS 
from sklearn.preprocessing import MinMaxScaler as MM 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer as LB 
from sklearn.preprocessing import OneHotEncoder
from sklearn.externals import joblib 
from sklearn_pandas import DataFrameMapper
from sklearn2pmml import PMMLPipeline
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn2pmml import sklearn2pmml

import sys
# Raw strings keep the Windows backslashes literal: '\J' and '\j' are not
# valid escape sequences and trigger a warning on current Python versions.
# NOTE(review): sys.path only affects Python module lookup; confirm whether
# the Java runtime should instead be reachable through the PATH variable.
sys.path.append(r'E:\Java')
sys.path.append(r'E:\Java\jdk1')


# Read heart.csv from the working directory into a DataFrame.
heart_data = pd.read_csv(filepath_or_buffer="heart.csv")


class A:
    """Toy custom transformer that triples its input.

    Exposes the ``fit``/``transform`` pair so it can be plugged in as a
    pipeline step.
    """

    def fit(self, X, params=None):
        """Learn nothing and return ``self``.

        ``X`` and ``params`` are accepted and ignored. ``params`` defaults to
        ``None`` so the method works with or without a second argument.
        Returning ``self`` allows ``fit(...).transform(...)`` chaining.
        """
        return self

    def transform(self, X):
        """Return ``3 * X``."""
        return 3 * X



# Try the custom class A as the only step of a PMMLPipeline, using the
# scalar 2 as input for both fit and transform.
pipeline2 = PMMLPipeline(
    [
        ("test", A()),
    ]
)
pipeline2.fit(2)
pipeline2.transform(2)


#aa=A()
#aa.fit(5,10)
#aa.transform(5)



# Feature engineering declared through a DataFrameMapper.
# Entries keep the original order: two min-max scaled columns, five
# pass-through columns (transformer None), then log-transformed age.
# The LabelBinarizer entry for 'famhist' is disabled in the original script
# and therefore not part of the mapping.
mapper = DataFrameMapper(
    [([scaled_col], MM()) for scaled_col in ('sbp', 'tobacco')]
    + [(raw_col, None) for raw_col in ('ldl', 'adiposity', 'typea', 'obesity', 'alcohol')]
    + [(['age'], FunctionTransformer(np.log))]
)

# Pipeline that contains only the feature-engineering mapper; the
# classifier step is left out here.
pipeline = PMMLPipeline([('mapper', mapper)])

# Every column except 'chd' and 'famhist' is handed to the pipeline.
mapper_inputs = heart_data[heart_data.columns.difference(["chd", "famhist"])]
pipeline.fit(mapper_inputs)
aaa = pipeline.transform(mapper_inputs)

# Export the fitted pipeline as a PMML file.
sklearn2pmml(pipeline, "aaa.pmml", with_repr=True)


# Pipeline that contains only a random-forest classifier; the mapper step
# is left out here.
pipeline1 = PMMLPipeline([("classifier", RandomForestClassifier())])

# Features are every column except 'chd' and 'famhist'; 'chd' is the target.
classifier_inputs = heart_data[heart_data.columns.difference(["chd", "famhist"])]
pipeline1.fit(classifier_inputs, heart_data["chd"])
aaa = pipeline1.predict(classifier_inputs)

# Bare attribute lookup kept from the original script (its value is only
# displayed when run interactively).
mapper.transformed_names_

# Export the fitted classifier pipeline as a PMML file.
sklearn2pmml(pipeline1, "bbb.pmml", with_repr=True)

在上面的代码中,不能对自己定义的数据预处理函数进行sklearn2pmml的输出,因为sklearn2pmml只能转换它所支持的sklearn家族中的转换器和模型,所以不认识上面pipeline2中自己定义的A类中的函数;如果是sklearn下面的一些数据预处理函数,它可以导出为pmml文件;如果不考虑导出pmml文件,只是对数据在python环境下进行离线处理,则可以将自定义的A类嵌入在mapper和pipeline中使用。

你可能感兴趣的:(Python学习,机器学习)