这篇博客打算说一说 Trident-ml(基于 Storm 的机器学习算法包)里面的一个分类算法:Online Passive-Aggressive Algorithms(论文链接:http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf)。不知道该翻译成什么更为合适,后面直接简称 PA。
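下面以二类分类器为例,这里规定样本(或者称为实例)的类别为 +1 或者 -1,我们可以将在线分类算法描述成以下过程:给定实例 $x_t$ 以及该实例的类别 $y_t$,找出一个合适的权值向量 $w$,使用 $\operatorname{sign}(w \cdot x)$ 表示分类器预测的值。很显然,$y_t \operatorname{sign}(w_t \cdot x_t) > 0$ 是我们最希望见到的,因为此时分类器的输出是正确的。我们更希望把属于 $\{+1, -1\}$ 的实例在 $w$ 这个超平面上以一定的间隔分离,这一点有点像支持向量机,因此这里可以定义如下损失函数(hinge loss):$l(w;(x,y)) = \max\bigl(0,\ 1 - y\,(w \cdot x)\bigr)$。PA 算法在每一步求解如下的优化问题来更新权值: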
$$w_{t+1} = \arg\min_{w \in \mathbb{R}^n} \frac{1}{2}\lVert w - w_t \rVert^2 \quad \text{s.t.} \quad l(w;(x_t,y_t)) = 0$$
1 若损失函数为 0,即分类正确且间隔不小于 1,则权值向量不更新
2 若损失函数大于 0,这个时候就要使用拉格朗日乘数法求解最优值了:
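构造拉格朗日函数 $L(w,\lambda) = \frac{1}{2}\lVert w - w_t \rVert^2 + \lambda\bigl(1 - y\,(w \cdot x)\bigr)$,对 $w$ 求偏导并令其为 0,再利用 $y^2 = 1$ 以及约束 $y\,(w \cdot x) = 1$:
$$
\begin{aligned}
\frac{\partial L}{\partial w} = w - w_t - \lambda y x = 0
&\;\Rightarrow\; y\,(w \cdot x) - y\,(w_t \cdot x) - \lambda y^2 \lVert x \rVert^2 = 0 \\
&\;\Rightarrow\; y\,(w \cdot x) - y\,(w_t \cdot x) - \lambda \lVert x \rVert^2 = 0 \\
&\;\Rightarrow\; 1 - y\,(w_t \cdot x) = \lambda \lVert x \rVert^2 \\
&\;\Rightarrow\; \lambda = \frac{l_t}{\lVert x \rVert^2}
\end{aligned}
$$
因此权值更新为 $w_{t+1} = w_t + \lambda\, y\, x$。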
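PA-I:$$w_{t+1} = \arg\min_{w \in \mathbb{R}^n} \frac{1}{2}\lVert w - w_t \rVert^2 + C\xi \quad \text{s.t.} \quad l(w;(x_t,y_t)) \le \xi \ \text{ and } \ \xi \ge 0$$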
PA-II:$$w_{t+1} = \arg\min_{w \in \mathbb{R}^n} \frac{1}{2}\lVert w - w_t \rVert^2 + C\xi^2 \quad \text{s.t.} \quad l(w;(x_t,y_t)) \le \xi$$
package com.github.pmerienne.trident.ml.util;
import org.jblas.DoubleMatrix;
public class MathUtil {
public static boolean isZeros(DoubleMatrix matrix) {//判断是否是0矩阵
boolean zeroVector = true;
for (int i = 0; i < matrix.rows; i++) {
for (int j = 0; j < matrix.columns; j++) {
if (matrix.get(i, j) != 0.0) {
zeroVector = false;
return zeroVector;
public static double dot(double[] vector1, double[] vector2) {//向量内积
if (vector1.length != vector2.length) {
throw new IllegalArgumentException("The dimensions have to be equal!");
double sum = 0;
for (int i = 0; i < vector1.length; i++) {
sum += vector1[i] * vector2[i];
return sum;
public static Double norm(double[] vector) {//向量的模
double meanSqrd = 0;
for (int i = 0; i < vector.length; i++) {
meanSqrd += vector[i] * vector[i];
return Math.sqrt(meanSqrd);
public static double[] mult(double[] vector, double scalar) {//向量数乘
int length = vector.length;
double[] result = new double[length];
for (int i = 0; i < length; i++) {
result[i] = vector[i] * scalar;
return result;
public static double[] add(double[] vector1, double[] vector2) {//向量之和
if (vector1.length != vector2.length) {
throw new IllegalArgumentException("The dimensions have to be equal!");
double[] result = new double[vector1.length];
assert vector1.length == vector2.length;
for (int i = 0; i < vector1.length; i++) {
result[i] = vector1[i] + vector2[i];
return result;
public static double[] subtract(double[] vector1, double[] vector2) {//向量相减
if (vector1.length != vector2.length) {
throw new IllegalArgumentException("The dimensions have to be equal!");
double[] result = new double[vector1.length];
assert vector1.length == vector2.length;
for (int i = 0; i < vector1.length; i++) {
result[i] = vector1[i] - vector2[i];
return result;
public static double euclideanDistance(double[] a, double[] b) {//向量的欧式距离
if (a.length != b.length) {
throw new IllegalArgumentException("The dimensions have to be equal!");
double sum = 0.0;
for (int i = 0; i < a.length; i++) {
sum += Math.pow(a[i] - b[i], 2);
return Math.sqrt(sum);
public static double[] normalize(double[] vector) {//向量标准化
double magnitude = magnitude(vector);
return magnitude != 0 ? mult(vector, 1 / magnitude) : vector;
public static double magnitude(double[] vector) {
double magnitude = 0.0;
for (int i = 0; i < vector.length; i++) {
magnitude += Math.pow(vector[i], 2);
return Math.sqrt(magnitude);
package com.github.pmerienne.trident.ml.classification;
import com.github.pmerienne.trident.ml.util.MathUtil;
/** * Passive-Aggresive binary classifier. * * @see Online Passive-Aggressive Algorithms * * Koby Crammer, Ofer Dekel, Joseph Keshet, Shai Shalev-Shwartz, Yoram * Singer; 7(Mar):551--585, 2006. * @author pmerienne * */
public class PAClassifier implements Classifier<Boolean> {
private static final long serialVersionUID = -5163481593640555140L;
private double[] weights;
private Type type = Type.STANDARD;
private Double aggressiveness = 0.001;
public PAClassifier() {
public PAClassifier(Type type) {
this.type = type;
public PAClassifier(Type type, Double aggressiveness) {
this.type = type;
this.aggressiveness = aggressiveness;
public Boolean classify(double[] features) {
if (this.weights == null) {
Double evaluation = MathUtil.dot(features, this.weights); //以0位分类阀值
Boolean prediction = evaluation >= 0 ? Boolean.TRUE : Boolean.FALSE;
return prediction;
public void update(Boolean expectedLabel, double[] features) {
if (this.weights == null) {
Double expectedLabelAsInt = expectedLabel ? 1.0 : -1.0;
//分类错误,则 y*w_t*x<0
//还是在线学习 max(0, 1- y*f(x) ) == max(0, 1-y*w_t*x) hinge-loss function
double loss = Math.max(0.0, 1 - (expectedLabelAsInt * MathUtil.dot(this.weights, features)));
double update = 0;
if (Type.STANDARD.equals(this.type)) {
update = loss / (1 + Math.pow(MathUtil.norm(features), 2)); //PA1 lamda
} else if (Type.PA1.equals(this.type)) {
update = Math.min(this.aggressiveness, loss / Math.pow(MathUtil.norm(features), 2));//PA2 lamda
} else if (Type.PA2.equals(this.type)) {
update = loss / (Math.pow(MathUtil.norm(features), 2) + (1.0 / (2 * this.aggressiveness)));//PA2 lamda
double[] scaledFeatures = MathUtil.mult(features, update * expectedLabelAsInt); // lamda * y(t) * x(t)
this.weights = MathUtil.add(this.weights, scaledFeatures);//权值更新 w(t+1) = w(t) + lamda * y(t) * x(t)
protected void init(int featureSize) {
// Init weights
this.weights = new double[featureSize];
public void reset() {
this.weights = null;
public double[] getWeights() {
return weights;
public void setWeights(double[] weights) {
this.weights = weights;
public Type getType() {
return type;
public void setType(Type type) {
this.type = type;
public Double getAggressiveness() {
return aggressiveness;
public void setAggressiveness(Double aggressiveness) {
this.aggressiveness = aggressiveness;
public String toString() {
return "PAClassifier [type=" + type + ", aggressiveness=" + aggressiveness + "]";
public static enum Type {
package com.zhangluoyang.experiment;
import backtype.storm.Config;
import backtype.storm.ILocalDRPC;
import backtype.storm.LocalCluster;
import backtype.storm.LocalDRPC;
import backtype.storm.drpc.DRPCSpout;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.tuple.Fields;
import com.github.pmerienne.trident.ml.classification.ClassifierUpdater;
import com.github.pmerienne.trident.ml.classification.ClassifyQuery;
import com.github.pmerienne.trident.ml.classification.PAClassifier;
import com.github.pmerienne.trident.ml.classification.PerceptronClassifier;
import com.github.pmerienne.trident.ml.testing.NANDSpout;
import storm.trident.TridentState;
import storm.trident.TridentTopology;
import storm.trident.testing.MemoryMapState;
// com.zhangluoyang.experiment.MlExample
public class MlExample {
public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException {
// TODO Auto-generated method stub
TridentTopology toppology = new TridentTopology();
// Create perceptron state from labeled instances stream
TridentState perceptronModel = toppology
// Emit tuple with a labeled instance of enhanced NAND features
// i.e. : {label=true, features=[1.0 0.0 1.0]} or {label=false, features=[1.0 1.0 1.0]}
.newStream("nandsamples", new NANDSpout())
// Update perceptron
.partitionPersist(new MemoryMapState.Factory(), new Fields("instance"), new ClassifierUpdater<Boolean>("PAclassification", new PAClassifier()));
// Classify instance from a DRPC stream
// Transform DRPC ARGS to unlabeled instance
.each(new Fields("args"), new DRPCArgsToInstance(), new Fields("instance"))
// Classify instance using perceptron state
.stateQuery(perceptronModel, new Fields("instance"), new ClassifyQuery<Boolean>("PAclassification"), new Fields("prediction"));
String word = "1.0,0.0,1.0";
Config conf = new Config();
StormSubmitter stormSubmitter = new StormSubmitter();
stormSubmitter.submitTopology("LearningStormToplogy", conf, toppology.build());
package com.zhangluoyang.experiment;
import org.apache.thrift7.TException;
import backtype.storm.generated.DRPCExecutionException;
import backtype.storm.utils.DRPCClient;
public class MLclient {
public static void main(String[] args) throws TException, DRPCExecutionException {
// TODO Auto-generated method stub
DRPCClient client = new DRPCClient("localhost", 3772);
String result = client.execute("predict", "1.0,1.0,1.0");