from pyalink.alink import *
Use one of the following commands to start using PyAlink:
- useLocalEnv(parallelism, flinkHome=None, config=None)
- useRemoteEnv(host, port, parallelism, flinkHome=None, localIp="localhost", config=None)
Call resetEnv() to reset environment and switch to another.
# Start a local PyAlink environment with parallelism 2; flinkHome and config
# are left at their defaults (None).
useLocalEnv(parallelism=2)
JVM listening on 127.0.0.1:38737
MLEnv(benv=JavaObject id=o2, btenv=JavaObject id=o5, senv=JavaObject id=o3, stenv=JavaObject id=o6)
# Read the Iris CSV as a batch source with an explicit five-column schema.
source = (
    CsvSourceBatchOp()
    .setSchemaStr(
        "sepal_length double, sepal_width double, "
        "petal_length double, petal_width double, category string"
    )
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv")
)

# Keep only the two sepal columns, collect them into a DataFrame and print it.
res = source.select(["sepal_length", "sepal_width"])
df = res.collectToDataframe()
print(df)
sepal_length sepal_width
0 5.0 3.2
1 6.6 3.0
2 5.4 3.9
3 5.0 2.3
4 5.5 3.5
.. ... ...
145 6.4 2.9
146 6.3 2.5
147 5.8 2.6
148 5.7 4.4
149 6.5 3.0
[150 rows x 2 columns]
# Show only the first row of the collected DataFrame.
df.head(n=1)
|
sepal_length |
sepal_width |
0 |
5.0 |
3.2 |
from pyalink.alink import *
import sys, os
# Drop the current environment and start a fresh local one with parallelism 2.
resetEnv()
useLocalEnv(parallelism=2)
MLEnv(benv=JavaObject id=o178, btenv=JavaObject id=o181, senv=JavaObject id=o179, stenv=JavaObject id=o182)
# Schema shared by the batch sample and the streaming source: one
# "<name> <type>" entry per CSV column, joined with ", ".
schemaStr = ", ".join([
    "id string",
    "click string",
    "dt string",
    "C1 string",
    "banner_pos int",
    "site_id string",
    "site_domain string",
    "site_category string",
    "app_id string",
    "app_domain string",
    "app_category string",
    "device_id string",
    "device_ip string",
    "device_model string",
    "device_type string",
    "device_conn_type string",
    "C14 int",
    "C15 int",
    "C16 int",
    "C17 int",
    "C18 int",
    "C19 int",
    "C20 int",
    "C21 int",
])

# Location of the small Avazu sample used for batch training.
batchTrainDataFn = "http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-small.csv"
# Batch source over the small Avazu sample; the first CSV line is skipped.
trainBatchData = (
    CsvSourceBatchOp()
    .setFilePath(batchTrainDataFn)
    .setSchemaStr(schemaStr)
    .setIgnoreFirstLine(True)
)
# Bare expression: echoes the operator when run as the last line of a notebook cell.
trainBatchData
# Column roles used by the feature pipeline and the FTRL model.
labelColName = "click"      # label column
vecColName = "vec"          # output column that receives the hashed feature vector
nameHashFeatures = 30000    # passed to FeatureHasher.setNumFeatures and FTRL setVectorSize

# Every column fed to the feature hasher.
selectedColNames = [
    "C1", "banner_pos", "site_category", "app_domain",
    "app_category", "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21",
    "site_id", "site_domain", "device_id", "device_model",
]

# Subset of selectedColNames that is treated as categorical.
categoryColNames = [
    "C1", "banner_pos", "site_category", "app_domain",
    "app_category", "device_type", "device_conn_type",
    "site_id", "site_domain", "device_id", "device_model",
]

# Remaining (numeric) columns, scaled before hashing.
# NOTE: these literals must use plain ASCII double quotes; the typographic
# quotes previously used here are not string delimiters and raise SyntaxError.
numericalColNames = ["C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"]
# Streaming source over avazu-ctr-train-8M.csv, using the shared schema and
# skipping the first CSV line.
wholeDataFile = "http://alink-release.oss-cn-beijing.aliyuncs.com/data-files/avazu-ctr-train-8M.csv"
data = (
    CsvSourceStreamOp()
    .setFilePath(wholeDataFile)
    .setSchemaStr(schemaStr)
    .setIgnoreFirstLine(True)
)

# Split the stream with fraction 0.5: the operator's main output is used for
# training, its side output 0 for testing.
spliter = SplitStreamOp().setFraction(0.5).linkFrom(data)
train_stream_data = spliter
test_stream_data = spliter.getSideOutput(0)
# Two-stage feature pipeline:
#   1. StandardScaler over the numerical columns;
#   2. FeatureHasher over the selected columns (with the categorical subset
#      declared), writing a vector of nameHashFeatures features to vecColName.
feature_pipeline = Pipeline().add(
    StandardScaler().setSelectedCols(numericalColNames)
).add(
    FeatureHasher()
    .setSelectedCols(selectedColNames)
    .setCategoricalCols(categoryColNames)
    .setOutputCol(vecColName)
    .setNumFeatures(nameHashFeatures)
)
# Path of the saved feature pipeline model, placed in the current working directory.
FEATURE_PIPELINE_MODEL_FILE = os.path.join(os.getcwd(), "feature_pipe_model.csv")
# Fit the pipeline on the batch sample and save the fitted model to that path.
feature_pipeline.fit(trainBatchData).save(FEATURE_PIPELINE_MODEL_FILE);
# NOTE(review): presumably save() only registers a sink and this call actually
# runs the batch job that writes the file — confirm against the Alink docs.
BatchOperator.execute();
# Reload the fitted pipeline model from disk; later cells use it for both the
# batch and the streaming transforms.
feature_pipelineModel = PipelineModel.load(FEATURE_PIPELINE_MODEL_FILE);
# Train a batch logistic-regression model on the transformed batch sample
# (vector column vecColName, label labelColName, intercept on, 10 iterations max).
lr = LogisticRegressionTrainBatchOp()
initModel = (
    lr.setVectorCol(vecColName)
    .setLabelCol(labelColName)
    .setWithIntercept(True)
    .setMaxIter(10)
    .linkFrom(feature_pipelineModel.transform(trainBatchData))
)
# FTRL training on the transformed training stream. initModel is handed to the
# constructor (presumably as the starting model — confirm against the Alink docs).
model = (
    FtrlTrainStreamOp(initModel)
    .setVectorCol(vecColName)
    .setLabelCol(labelColName)
    .setWithIntercept(True)
    .setAlpha(0.1)
    .setBeta(0.1)
    .setL1(0.01)
    .setL2(0.01)
    .setTimeInterval(10)
    .setVectorSize(nameHashFeatures)
    .linkFrom(feature_pipelineModel.transform(train_stream_data))
)
# Predict on the transformed test stream, linking the FTRL model stream as the
# first input. The output keeps the label column and adds "pred" and "details".
predResult = (
    FtrlPredictStreamOp(initModel)
    .setVectorCol(vecColName)
    .setPredictionCol("pred")
    .setReservedCols([labelColName])
    .setPredictionDetailCol("details")
    .linkFrom(model, feature_pipelineModel.transform(test_stream_data))
)

# Register a live display of the prediction stream under the key "predResult".
predResult.print(key="predResult", refreshInterval=30, maxLimit=20)
'DataStream predResult : ( Updated on 2020-04-02 16:10:50, #items received: 196530 )'
|
click |
pred |
details |
0 |
0 |
0 |
{"0":"0.893231946299782","1":"0.10676805370021... |
1 |
1 |
0 |
{"0":"0.745142214488233","1":"0.25485778551176... |
2 |
1 |
0 |
{"0":"0.765632769073235","1":"0.23436723092676... |
3 |
0 |
0 |
{"0":"0.8113419833506623","1":"0.1886580166493... |
4 |
0 |
0 |
{"0":"0.929177522686554","1":"0.07082247731344... |
5 |
1 |
0 |
{"0":"0.7859656548828632","1":"0.2140343451171... |
6 |
0 |
0 |
{"0":"0.8559101947601475","1":"0.1440898052398... |
7 |
0 |
0 |
{"0":"0.9007309902743751","1":"0.0992690097256... |
8 |
0 |
0 |
{"0":"0.7747776539114233","1":"0.2252223460885... |
9 |
0 |
0 |
{"0":"0.7113793792746559","1":"0.2886206207253... |
10 |
0 |
0 |
{"0":"0.8067465417336181","1":"0.1932534582663... |
11 |
0 |
0 |
{"0":"0.9386237136980374","1":"0.0613762863019... |
12 |
0 |
0 |
{"0":"0.9188682855816503","1":"0.0811317144183... |
13 |
0 |
0 |
{"0":"0.6924579330847471","1":"0.3075420669152... |
14 |
0 |
0 |
{"0":"0.7393514879088229","1":"0.2606485120911... |
15 |
1 |
0 |
{"0":"0.8084501207999263","1":"0.1915498792000... |
16 |
0 |
0 |
{"0":"0.949887889053032","1":"0.05011211094696... |
17 |
0 |
0 |
{"0":"0.7547176580812045","1":"0.2452823419187... |
18 |
0 |
0 |
{"0":"0.5494833642638153","1":"0.4505166357361... |
19 |
0 |
0 |
{"0":"0.9240806476231835","1":"0.0759193523768... |
# Evaluate the prediction stream as a binary classifier (time interval 10),
# extract Accuracy / AUC / ConfusionMatrix from the JSON in the "Data" column
# while keeping "Statistics", and display the result under the key "evaluation".
(
    EvalBinaryClassStreamOp()
    .setLabelCol(labelColName)
    .setPredictionCol("pred")
    .setPredictionDetailCol("details")
    .setTimeInterval(10)
    .linkFrom(predResult)
    .link(
        JsonValueStreamOp()
        .setSelectedCol("Data")
        .setReservedCols(["Statistics"])
        .setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])
        .setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
    )
    .print(key="evaluation", refreshInterval=30, maxLimit=20)
)

# Launch the streaming job; the trailing ';' is kept so a notebook does not
# echo the call's return value.
StreamOperator.execute();
'DataStream evaluation : ( Updated on 2020-04-02 16:11:00, #items received: 93 )'
|
Statistics |
Accuracy |
AUC |
ConfusionMatrix |
0 |
all |
0.8208864412861355 |
0.7018136340935746 |
[[1740,1715],[32728,156114]] |
1 |
window |
0.8257520709199244 |
0.7048161223985424 |
[[37,53],[1146,5645]] |
2 |
window |
0.8118575185273726 |
0.7082768044875817 |
[[18,27],[760,3378]] |
3 |
all |
0.8206942182410424 |
0.7019777696316556 |
[[1758,1742],[33488,159492]] |
4 |
window |
0.8255558270217445 |
0.7054619366258229 |
[[21,13],[701,3358]] |