机器学习模型上线主要分为两种方式,一种是基于HTTP服务方式,这种方式需要在生产环境上部署相应的python环境以及相应的python机器学习包,这种方式好处在于将数据预处理部分和模型部分(保存成pkl文件)都能一起部署在HTTP服务中,缺点则是需要在生产上部署相应的python环境;而另一种方式则是通过PMML方式将机器学习模型打包给java环境使用,这种方法好处在于能使生产环境脱离python环境,只需要有java环境即可,但是最大的缺点就是不能将数据预处理部分自己编写的数据处理函数打包成PMML文件,因为PMML文件是用sklearn2pmml导出的,因此它只认识sklearn家族中的函数和模型算法。
#
#!coding=UTF-8
from http.server import HTTPServer,BaseHTTPRequestHandler
import io,shutil,urllib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def companyinformationinput(companyname, entType, termDuration, startDuration,
                            numberEmployers, numberEmployees, numberBranches,
                            ratioProvident):
    """Predict a grade letter for one company with the saved random forest.

    The numeric inputs are coerced with ``int``/``float``, the enterprise
    type is encoded through the dictionary stored in ``word2index.txt``
    (types missing from that dictionary are encoded as 0), and the resulting
    single-row frame is scored by the model pickled in
    ``classifier_RandomForestModelwordsvectornotin.pkl``. Both files are read
    from the current working directory on every call.

    Args:
        companyname: Company name. Accepted for interface compatibility; it
            is not one of the model features.
        entType: Enterprise type, looked up in the word2index dictionary.
        termDuration: Value for the '营业时长' feature; converted with int().
        startDuration: Value for the '成立时长' feature; converted with int().
        numberEmployers: Value for the '所有股东人数(employees)' feature;
            converted with int().
        numberEmployees: Value for the '主要人员数(employees)' feature;
            converted with int().
        numberBranches: Value for the '分支机构数(branches)' feature;
            converted with int().
        ratioProvident: Value for the '单位比例' feature; converted with
            float().

    Returns:
        The letter mapped from the predicted label (1 -> 'A', 2 -> 'B',
        3 -> 'C', 4 -> 'D'); a label outside that mapping yields NaN. The
        string 'NULL' is returned when anything fails (bad input, missing
        dictionary or model file, import or prediction error).
    """
    import logging

    feature_columns = ['企业类型(entType)', '营业时长', '成立时长',
                       '所有股东人数(employees)', '主要人员数(employees)',
                       '分支机构数(branches)', '单位比例']
    grade_by_label = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}

    try:
        # Coerce the inputs first so malformed values fail before any file I/O.
        numeric_features = {
            '营业时长': int(termDuration),
            '成立时长': int(startDuration),
            '所有股东人数(employees)': int(numberEmployers),
            '主要人员数(employees)': int(numberEmployees),
            '分支机构数(branches)': int(numberBranches),
            '单位比例': float(ratioProvident),
        }

        # Load the custom enterprise-type dictionary; `with` closes the file
        # even when reading or parsing fails.
        with open('word2index.txt', 'r') as dictionary_file:
            # NOTE(security): eval() executes whatever word2index.txt
            # contains. Kept for compatibility with the existing file format;
            # only use a trusted file (ast.literal_eval would be safer if the
            # file holds a plain dict literal -- confirm before switching).
            word2index = eval(dictionary_file.read())

        # Encode the enterprise type; a type never seen before becomes 0.
        row = {'企业类型(entType)': word2index.get(entType, 0)}
        row.update(numeric_features)
        features = pd.DataFrame(
            {column: [row[column]] for column in feature_columns},
            columns=feature_columns,
        )

        # sklearn.externals.joblib was removed in scikit-learn 0.23; fall
        # back to the standalone joblib package that scikit-learn depends on.
        try:
            from sklearn.externals import joblib
        except ImportError:
            import joblib

        # NOTE(security): joblib.load unpickles the file; only load model
        # files from a trusted source.
        classifier = joblib.load(
            'classifier_RandomForestModelwordsvectornotin.pkl')

        predicted = pd.Series(classifier.predict(features)).map(grade_by_label)
        return predicted[0]
    except Exception:
        # Best-effort contract: callers receive 'NULL' rather than an
        # exception, but the cause is logged instead of silently discarded.
        logging.getLogger(__name__).exception(
            "companyinformationinput failed; returning 'NULL'")
        return 'NULL'
#def randomforest(a,b):
# #模型加载
# X_test=np.arra
from sklearn_pandas import DataFrameMapper, cross_val_score
import sklearn.decomposition
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import xgboost as xgb
import random
import os
from datetime import date,datetime
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler as SS
from sklearn.preprocessing import MinMaxScaler as MM
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer as LB
from sklearn.preprocessing import OneHotEncoder
from sklearn.externals import joblib
from sklearn_pandas import DataFrameMapper
from sklearn2pmml import PMMLPipeline
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn2pmml import sklearn2pmml
import sys
# Append the local Java install directories to the Python module search path.
# Raw strings are used because '\J' and '\j' are invalid escape sequences in
# ordinary string literals; the resulting path values are unchanged.
# NOTE(review): sys.path only affects Python imports. If the intent was to let
# sklearn2pmml locate the `java` executable, that is presumably governed by
# the PATH environment variable instead -- confirm.
sys.path.append(r'E:\Java')
sys.path.append(r'E:\Java\jdk1')
# Load the demo dataset from the working directory. The code below reads the
# columns sbp, tobacco, ldl, adiposity, famhist, typea, obesity, alcohol, age
# and the target column chd from it.
heart_data = pd.read_csv(filepath_or_buffer="heart.csv")
class A:
    """Toy hand-written transformer that triples its input.

    It exposes the ``fit``/``transform`` pair that pipeline steps are called
    with, and is used to demonstrate a custom preprocessing step.
    """

    def fit(self, X, params=None):
        """Do nothing and return ``self``.

        Args:
            X: Training input; ignored.
            params: Second positional argument; ignored. Optional so that
                both ``fit(X)`` and ``fit(X, y)`` style calls work.

        Returns:
            This instance, so that ``fit(...).transform(...)`` chaining
            works (returning ``None`` would break that call pattern).
        """
        return self

    def transform(self, X):
        """Return ``3 * X``."""
        return 3 * X
# Wrap the hand-written transformer A in a one-step PMMLPipeline, then fit
# and transform the scalar 2. The transform result is evaluated but not
# stored.
custom_steps = [("test", A())]
pipeline2 = PMMLPipeline(custom_steps)
pipeline2.fit(2)
pipeline2.transform(2)
#aa=A()
#aa.fit(5,10)
#aa.transform(5)
# Define the feature engineering with a DataFrameMapper.
# Entry order matches the original declaration: min-max scaled columns first,
# then pass-through columns (transformer None), then the log-transformed age.
# A LabelBinarizer entry for famhist -- (['famhist'], LB()) -- is disabled; it
# used to sit between 'adiposity' and 'typea'.
minmax_features = [([column], MM()) for column in ('sbp', 'tobacco')]
passthrough_features = [
    (column, None)
    for column in ('ldl', 'adiposity', 'typea', 'obesity', 'alcohol')
]
log_features = [(['age'], FunctionTransformer(np.log))]
mapper = DataFrameMapper(
    minmax_features + passthrough_features + log_features
)

# Define the pipeline (feature engineering, model, ...). Only the mapper step
# is active here; the classifier step is disabled:
#     ("classifier", RandomForestClassifier())
pipeline = PMMLPipeline([('mapper', mapper)])

# Every column except the target (chd) and famhist is fed to the pipeline.
heart_feature_columns = heart_data.columns.difference(["chd", "famhist"])
pipeline.fit(heart_data[heart_feature_columns])
aaa = pipeline.transform(heart_data[heart_feature_columns])

# Export the fitted preprocessing-only pipeline to PMML.
sklearn2pmml(pipeline, "aaa.pmml", with_repr=True)
# Second pipeline: a random-forest classifier only. The ('mapper', mapper)
# preprocessing step is disabled here.
classifier_steps = [("classifier", RandomForestClassifier())]
pipeline1 = PMMLPipeline(classifier_steps)

# Train on every column except the target (chd) and famhist, then predict on
# the same rows.
predictor_columns = heart_data.columns.difference(["chd", "famhist"])
pipeline1.fit(heart_data[predictor_columns], heart_data["chd"])
aaa = pipeline1.predict(heart_data[predictor_columns])

# Evaluated but not stored: output column names of the previously fitted
# mapper.
mapper.transformed_names_

# Export the fitted classifier pipeline to PMML.
sklearn2pmml(pipeline1, "bbb.pmml", with_repr=True)
在上面的代码中,不能对自己定义的数据预处理函数进行sklearn2pmml的输出,因为sklearn2pmml只支持sklearn家族中的转换器和模型算法,所以不认识上面数据预处理中自己定义的A类中的函数;如果是sklearn下面的一些数据预处理函数,它可以导出为pmml文件;如果不考虑导出pmml文件,只是对数据在python环境下进行离线处理,则可以用自定义的A类函数并嵌入在mapper和pipeline中使用。