First Try of Alink

from pyalink.alink import *
Use one of the following commands to start using PyAlink:
 - useLocalEnv(parallelism, flinkHome=None, config=None)
 - useRemoteEnv(host, port, parallelism, flinkHome=None, localIp="localhost", config=None)
Call resetEnv() to reset environment and switch to another.
useLocalEnv(2, flinkHome=None, config=None)
JVM listening on 127.0.0.1:38737

MLEnv(benv=JavaObject id=o2, btenv=JavaObject id=o5, senv=JavaObject id=o3, stenv=JavaObject id=o6)

source = CsvSourceBatchOp()\
    .setSchemaStr("sepal_length double, sepal_width double, petal_length double, petal_width double, category string")\
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv")
res = source.select(["sepal_length", "sepal_width"])
df = res.collectToDataframe()
print(df)
     sepal_length  sepal_width
0             5.0          3.2
1             6.6          3.0
2             5.4          3.9
3             5.0          2.3
4             5.5          3.5
..            ...          ...
145           6.4          2.9
146           6.3          2.5
147           5.8          2.6
148           5.7          4.4
149           6.5          3.0

[150 rows x 2 columns]
df.head(1)
   sepal_length  sepal_width
0           5.0          3.2
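
Collecting the whole table into pandas is fine for 150 rows, but for a quick peek it is cheaper to truncate on the Alink side first. A minimal sketch, assuming firstN() and print() are available on batch operators as in the Alink Java API:

preview = source.select(["sepal_length", "sepal_width"]).firstN(5)
preview.print()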


# reset and set up the local environment
from pyalink.alink import *
import sys, os
resetEnv()
useLocalEnv(2)


MLEnv(benv=JavaObject id=o178, btenv=JavaObject id=o181, senv=JavaObject id=o179, stenv=JavaObject id=o182)


# schema of train data
schemaStr = "id string, click string, dt string, C1 string, banner_pos int, site_id string, \
            site_domain string, site_category string, app_id string, app_domain string, \
            app_category string, device_id string, device_ip string, device_model string, \
            device_type string, device_conn_type string, C14 int, C15 int, C16 int, C17 int, \
            C18 int, C19 int, C20 int, C21 int"
# prepare batch train data
batchTrainDataFn = "http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-small.csv"
trainBatchData = CsvSourceBatchOp().setFilePath(batchTrainDataFn) \
        .setSchemaStr(schemaStr) \
        .setIgnoreFirstLine(True)

trainBatchData
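
A quick way to confirm that the schema string matches the file, sketched under the assumption that getColNames() and firstN() are exposed on batch operators as in the Alink Java API:

print(trainBatchData.getColNames())
trainBatchData.firstN(3).print()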

labelColName = "click"
vecColName = "vec"
nameHashFeatures = 30000
selectedColNames =["C1","banner_pos","site_category","app_domain",
                  "app_category","device_type","device_conn_type", 
                  "C14","C15","C16","C17","C18","C19","C20","C21",
                   "site_id","site_domain","device_id","device_model"]
categoryColNames = ["C1","banner_pos","site_category","app_domain", 
                    "app_category","device_type","device_conn_type",
                    "site_id","site_domain","device_id","device_model"]

numericalColNames = ["C14","C15","C16","C17","C18","C19","C20","C21"]
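
A quick consistency check in plain Python (no Alink calls): every selected column should be covered by exactly the categorical and numerical lists.

assert set(selectedColNames) == set(categoryColNames) | set(numericalColNames)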

# prepare stream train data
wholeDataFile = "http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-ctr-train-8M.csv"
data = CsvSourceStreamOp() \
        .setFilePath(wholeDataFile) \
        .setSchemaStr(schemaStr) \
        .setIgnoreFirstLine(True)

# split stream to train and eval data
splitter = SplitStreamOp().setFraction(0.5).linkFrom(data)

# the main output carries the sampled 50% fraction; side output 0 carries the rest
train_stream_data = splitter
test_stream_data = splitter.getSideOutput(0)



# set up the feature engineering pipeline: StandardScaler normalizes the numerical
# columns, and FeatureHasher hashes the selected columns (with the categorical ones
# treated as categories) into a sparse vector of dimension nameHashFeatures
feature_pipeline = Pipeline() \
        .add(StandardScaler() \
                .setSelectedCols(numericalColNames)) \
        .add(FeatureHasher() \
                .setSelectedCols(selectedColNames) \
                .setCategoricalCols(categoryColNames) \
                .setOutputCol(vecColName) \
                .setNumFeatures(nameHashFeatures))
# fit and save the feature pipeline model
FEATURE_PIPELINE_MODEL_FILE = os.path.join(os.getcwd(), "feature_pipe_model.csv")
feature_pipeline.fit(trainBatchData).save(FEATURE_PIPELINE_MODEL_FILE)

# batch operators are lazy; execute() triggers the save job
BatchOperator.execute()

# load the pipeline model back from disk
feature_pipelineModel = PipelineModel.load(FEATURE_PIPELINE_MODEL_FILE)
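
To verify what the pipeline produces, one can transform a few rows and inspect the hashed vector column. A sketch under the same assumption that firstN()/print() are available on batch operators:

feature_pipelineModel.transform(trainBatchData) \
        .select([labelColName, vecColName]) \
        .firstN(3) \
        .print()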


# train initial batch model
lr = LogisticRegressionTrainBatchOp()
initModel = lr.setVectorCol(vecColName) \
        .setLabelCol(labelColName) \
        .setWithIntercept(True) \
        .setMaxIter(10) \
        .linkFrom(feature_pipelineModel.transform(trainBatchData))
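
Before switching to online training it can be useful to sanity-check the initial model on the batch data (this only measures fit on the training set, not generalization). A hedged sketch; the operator and metric calls follow the Alink examples and should be treated as assumptions:

batchPred = LogisticRegressionPredictBatchOp() \
        .setVectorCol(vecColName) \
        .setPredictionCol("pred") \
        .setPredictionDetailCol("details") \
        .linkFrom(initModel, feature_pipelineModel.transform(trainBatchData))

metrics = EvalBinaryClassBatchOp() \
        .setLabelCol(labelColName) \
        .setPositiveLabelValueString("1") \
        .setPredictionDetailCol("details") \
        .linkFrom(batchPred) \
        .collectMetrics()

print("AUC:", metrics.getAuc(), "Accuracy:", metrics.getAccuracy())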




# FTRL online training, warm-started from the batch model
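# alpha/beta control the per-coordinate learning rate (as in the FTRL-Proximal paper),
# l1/l2 are the regularization weights, timeInterval is how often (in seconds) an updated
# model is emitted downstream, and vectorSize must match the FeatureHasher dimension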
model = FtrlTrainStreamOp(initModel) \
        .setVectorCol(vecColName) \
        .setLabelCol(labelColName) \
        .setWithIntercept(True) \
        .setAlpha(0.1) \
        .setBeta(0.1) \
        .setL1(0.01) \
        .setL2(0.01) \
        .setTimeInterval(10) \
        .setVectorSize(nameHashFeatures) \
        .linkFrom(feature_pipelineModel.transform(train_stream_data))




# FTRL online prediction: linkFrom takes the streaming model first, then the transformed test stream
predResult = FtrlPredictStreamOp(initModel) \
        .setVectorCol(vecColName) \
        .setPredictionCol("pred") \
        .setReservedCols([labelColName]) \
        .setPredictionDetailCol("details") \
        .linkFrom(model, feature_pipelineModel.transform(test_stream_data))

predResult.print(key="predResult", refreshInterval=30, maxLimit=20)


'DataStream predResult : ( Updated on 2020-04-02 16:10:50, #items received: 196530 )'
click pred details
0 0 0 {"0":"0.893231946299782","1":"0.10676805370021...
1 1 0 {"0":"0.745142214488233","1":"0.25485778551176...
2 1 0 {"0":"0.765632769073235","1":"0.23436723092676...
3 0 0 {"0":"0.8113419833506623","1":"0.1886580166493...
4 0 0 {"0":"0.929177522686554","1":"0.07082247731344...
5 1 0 {"0":"0.7859656548828632","1":"0.2140343451171...
6 0 0 {"0":"0.8559101947601475","1":"0.1440898052398...
7 0 0 {"0":"0.9007309902743751","1":"0.0992690097256...
8 0 0 {"0":"0.7747776539114233","1":"0.2252223460885...
9 0 0 {"0":"0.7113793792746559","1":"0.2886206207253...
10 0 0 {"0":"0.8067465417336181","1":"0.1932534582663...
11 0 0 {"0":"0.9386237136980374","1":"0.0613762863019...
12 0 0 {"0":"0.9188682855816503","1":"0.0811317144183...
13 0 0 {"0":"0.6924579330847471","1":"0.3075420669152...
14 0 0 {"0":"0.7393514879088229","1":"0.2606485120911...
15 1 0 {"0":"0.8084501207999263","1":"0.1915498792000...
16 0 0 {"0":"0.949887889053032","1":"0.05011211094696...
17 0 0 {"0":"0.7547176580812045","1":"0.2452823419187...
18 0 0 {"0":"0.5494833642638153","1":"0.4505166357361...
19 0 0 {"0":"0.9240806476231835","1":"0.0759193523768...
# streaming evaluation of the FTRL predictions; setTimeInterval(10) emits updated metrics every 10 seconds
EvalBinaryClassStreamOp() \
        .setLabelCol(labelColName) \
        .setPredictionCol("pred") \
        .setPredictionDetailCol("details") \
        .setTimeInterval(10) \
        .linkFrom(predResult) \
        .link(JsonValueStreamOp() \
                .setSelectedCol("Data") \
                .setReservedCols(["Statistics"]) \
                .setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"]) \
                .setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])) \
        .print(key="evaluation", refreshInterval=30, maxLimit=20)
StreamOperator.execute()
'DataStream evaluation : ( Updated on 2020-04-02 16:11:00, #items received: 93 )'
Statistics Accuracy AUC ConfusionMatrix
0 all 0.8208864412861355 0.7018136340935746 [[1740,1715],[32728,156114]]
1 window 0.8257520709199244 0.7048161223985424 [[37,53],[1146,5645]]
2 window 0.8118575185273726 0.7082768044875817 [[18,27],[760,3378]]
3 all 0.8206942182410424 0.7019777696316556 [[1758,1742],[33488,159492]]
4 window 0.8255558270217445 0.7054619366258229 [[21,13],[701,3358]]
